# for emacs: -*- mode: sh; -*- # Drosophila pseudoobscura -- # # Flybase release 1.0. Partly chrom-based, partly linkage-group based, # but at 17 sequences total that's nicer than 759 scaffolds. Also comes # with gene annotations (predicted genes and putative Drosophila homologs # based on tblastn). # # ftp://flybase.net/genomes/Drosophila_pseudoobscura/current/dpse_release1.0.txt # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | flyBaseGene | genePred flyBasePep | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | twinscan | genePred twinscanPep | # | genewise | genePred genewisePep | # | geneid | genePred geneidPep | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # DOWNLOAD SEQUENCE (DONE 11/15/04 angie) ssh kksilo mkdir /cluster/store8/dp3 cd /cluster/data ln -s /cluster/store8/dp3 dp3 cd /cluster/data/dp3 mkdir jkStuff bed mkdir downloads cd downloads wget ftp://flybase.net/genomes/Drosophila_pseudoobscura/current/fasta/dpse-all-chromosome-r1.03.fasta.gz zcat dpse-all-chromosome-r1.03.fasta.gz | faSize stdin #141079253 bases (6504941 N's 134574312 real) in 17 sequences in 1 files #Total size: mean 8298779.6 sd 7909506.4 min 374547 (XL_group3b) max 30711475 (2) median 6604477 #N count: mean 382643.6 sd 405982.0 zcat dpse-all-chromosome-r1.03.fasta.gz \ | sed -e 's/^>/>chr/' \ | faSplit byName stdin stubArg # Put chromosome files in our usual $c/$chr.fa hierarchy: foreach f (chr*.fa) set chr = $f:r set c = `echo $chr | sed -e 's/^chr//; s/_group.*$//;'` echo $chr to $c mkdir -p ../$c mv $f ../$c end # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CHUNKS OF N'S (DONE 11/15/04 angie) # Since we didn't get real AGP with gap info, we don't have enough info # to help us break into ~5MB chunks at proper gap boundaries. So just # use runs of N's as gaps. ssh kksilo cd /cluster/data/dp3 foreach f (?{,?}/chr*.fa) set chr = $f:t:r set c = $f:h faSplit gap -minGapSize=100 -lift=$c/$chr.lft \ $c/$chr.fa 5000000 $c/${chr}_ end # put chunks into usual "contig" dir structure foreach ctg (?{,?}/chr*_?{,?}.fa) mv $ctg $ctg:r/ end # make liftAll.lft: cat ?{,?}/chr*.lft > jkStuff/liftAll.lft # RUN REPEAT MASKER (DONE 11/15/04 angie) # January ("March") '04 version of RepeatMasker and libs. #- Split sequence into 500kb chunks, at gaps if possible: ssh kksilo cd /cluster/data/dp3 foreach c (?{,?}) foreach d ($c/chr*_?{,?}) cd $d echo "splitting $d" set contig = $d:t faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \ -minGapSize=100 cd ../.. end end # make the run directory, output directory, and job list cat << '_EOF_' > jkStuff/RMDrosophila #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/dp3/$2 /bin/cp $2 /tmp/dp3/$2/ pushd /tmp/dp3/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 popd /bin/cp /tmp/dp3/$2/$2.out ./ /bin/rm -fr /tmp/dp3/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/dp3/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/dp3 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMDrosophila mkdir RMRun RMOut cp /dev/null RMRun/RMJobs foreach d (?{,?}/chr*_?{,?}) foreach f ( $d/chr*_?{,?}_?{,?}.fa ) set chunk = $f:t echo ../jkStuff/RMDrosophila \ /cluster/data/dp3/$d $chunk \ '{'check in line+ /cluster/data/dp3/$f'}' \ '{'check out line+ /cluster/data/dp3/$f.out'}' \ >> RMRun/RMJobs end end # do the run ssh kk cd /cluster/data/dp3/RMRun para create RMJobs para try, check, push, check,... #Completed: 379 of 379 jobs #Average job time: 4774s 79.57m 1.33h 0.06d #Longest job: 6961s 116.02m 1.93h 0.08d #Submission to last job: 7079s 117.98m 1.97h 0.08d #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level ssh kksilo cd /cluster/data/dp3 foreach d (*/chr*_?{,?}) set chunk = $d:t echo $chunk liftUp $d/$chunk.fa.out $d/$chunk.lft warn $d/${chunk}_*.fa.out \ > /dev/null end #- Lift pseudo-contigs to chromosome level foreach f (?{,?}/chr*.fa) set chr = $f:t:r echo lifting $chr cd $f:h liftUp $chr.fa.out $chr.lft warn \ `awk '{print $2 "/" $2 ".fa.out";}' $chr.lft` \ > /dev/null cd .. end # Load the .out files into the database with: ssh hgwdev hgLoadOut dp3 /cluster/data/dp3/*/chr*.fa.out featureBits dp3 rmsk #7517385 bases of 134584124 (5.586%) in intersection # Compare to previous assembly: featureBits dp2 rmsk #6707169 bases of 129770350 (5.168%) in intersection # CREATING DATABASE (DONE 11/15/04 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 646G 1014G 39% /var/lib/mysql hgsql '' -e 'create database dp3' # Copy all the data from the table "grp" # in an existing database to the new database hgsql dp3 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # SIMPLE REPEATS (TRF) (DONE 11/15/04 angie) ssh kksilo mkdir -p /cluster/data/dp3/bed/simpleRepeat cd /cluster/data/dp3/bed/simpleRepeat mkdir trf cp /dev/null jobs.csh foreach d (/cluster/data/dp3/?{,?}/chr*_?{,?}) set f = $d/$d:t.fa set fout = $f:t:r.bed echo "trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \ >> jobs.csh end csh -fe jobs.csh \ |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \ > jobs.log & # check on this with tail -f jobs.log wc -l jobs.csh ls -1 trf | wc -l # 55 # Lift chunk results to one file for all chroms: liftUp simpleRepeat.bed ../../jkStuff/liftAll.lft warn trf/*.bed > /dev/null # Load this into the database as so ssh hgwdev hgLoadBed dp3 simpleRepeat \ /cluster/data/dp3/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql featureBits dp3 simpleRepeat #3280684 bases of 134584124 (2.438%) in intersection # Compare to previous assembly: featureBits dp2 simpleRepeat #3105979 bases of 129770350 (2.393%) in intersection # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/15/04 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/dp3/bed/simpleRepeat mkdir trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end mkdir trfMaskChrom foreach f (../../?{,?}/chr*.fa) set chr = $f:t:r liftUp trfMaskChrom/$chr.bed ../../jkStuff/liftAll.lft warn \ trfMask/$chr*.bed end # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/15/04 angie) ssh kksilo cd /cluster/data/dp3 echo repeat- and trf-masking chunks foreach d (?{,?}/chr*_?{,?}) set f = $d/$d:t.fa maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f bed/simpleRepeat/trfMask/$d:t.bed $f end echo repeat- and trf-masking chromosomes foreach f (?{,?}/chr*.fa) maskOutFa -soft $f $f.out $f maskOutFa -softAdd $f bed/simpleRepeat/trfMaskChrom/$f:t:r.bed $f end # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 11/15/04 angie) ssh kksilo cd /cluster/data/dp3 # Make a .2bit file: faToTwoBit ?{,?}/chr*.fa dp3.2bit # Make chromInfo.tab. mkdir bed/chromInfo twoBitInfo dp3.2bit stdout \ | awk '{printf "%s\t%s\t/gbdb/dp3/dp3.2bit\n", $1, $2;}' \ > bed/chromInfo/chromInfo.tab # Make symbolic links from /gbdb/dp3 to the 2bit. ssh hgwdev mkdir -p /gbdb/dp3 ln -s /cluster/data/dp3/dp3.2bit /gbdb/dp3/ # Load up chromInfo table hgsql dp3 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql dp3 -e 'load data local infile \ "/cluster/data/dp3/bed/chromInfo/chromInfo.tab" into table chromInfo' hgsql dp3 -N -e 'select chrom,size from chromInfo' \ > /cluster/data/dp3/chrom.sizes wc -l /cluster/data/dp3/chrom.sizes # 17 /cluster/data/dp3/chrom.sizes # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/16/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("dp3", "Nov. 2004", "/gbdb/dp3", "D. pseudoobscura", \ "chr4_group3:5603001-5614000", 1, 56, \ "D. pseudoobscura", \ "Drosophila pseudoobscura", "/gbdb/dp3/html/description.html", \ 0, 0, "FlyBase Release 1.03");' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "dp3" where genome = "D. pseudoobscura"' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit trackDb/makefile to add dp3 to the DBS variable. mkdir drosophila/dp3 # Create a simple drosophila/dp3/description.html file. cvs add drosophila/dp3 cvs add drosophila/dp3/description.html make update DBS=dp3 ZOO_DBS= # go public on genome-test cvs ci makefile cvs ci drosophila/dp3 mkdir /gbdb/dp3/html # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/15/04 angie) ssh kksilo cd /cluster/data/dp3 cat ?{,?}/chr*.fa \ | faGapSizes stdin \ -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000 # None of those round numbers seem overrepresented, so just use # hgFakeAgp's default -minContigGap of 25. foreach f (?{,?}/chr*.fa) hgFakeAgp $f stdout \ | awk '$5 == "N" {print;}' > $f:r.gap end ssh hgwdev hgLoadGap dp3 /cluster/data/dp3 # PRODUCING GENSCAN PREDICTIONS (DONE 11/16/04 angie) # Run on small cluster -- genscan needs big mem. ssh kksilo mkdir /cluster/data/dp3/bed/genscan cd /cluster/data/dp3/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Make hard-masked chunks foreach d (/cluster/data/dp3/?{,?}/chr*_?{,?}) set f = $d/$d:t.fa maskOutFa $f hard $f.masked end # Generate a list file, scaffolds.list, of all the hard-masked scaffolds # that *do not* consist of all-N's (which would cause genscan to blow up) cp /dev/null contigs.list foreach d (/cluster/data/dp3/?{,?}/chr*_?{,?}) set f = $d/$d:t.fa.masked egrep '[ACGT]' $f > /dev/null if ($status == 0) echo $f >> contigs.list end cat << '_EOF_' > gsub #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 contigs.list single gsub jobList ssh kki cd /cluster/data/dp3/bed/genscan para create jobList para try, check, push, check, ... #Completed: 54 of 55 jobs #Crashed: 1 jobs #Average job time: 95s 1.58m 0.03h 0.00d #Longest job: 232s 3.87m 0.06h 0.00d #Submission to last job: 431s 7.18m 0.12h 0.00d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # OK, 1200000 ran out of mem too... try 1000000. ssh kolossus cd /cluster/data/dp3/bed/genscan gsBig /cluster/data/dp3/XR/chrXR_group8_1/chrXR_group8_1.fa.masked gtf/chrXR_group8_1.fa.gtf -trans=pep/chrXR_group8_1.fa.pep -subopt=subopt/chrXR_group8_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1000000 # Convert these to chromosome level files as so: ssh kksilo cd /cluster/data/dp3/bed/genscan liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed cat pep/*.pep > genscan.pep # Load into the database as so: ssh hgwdev cd /cluster/data/dp3/bed/genscan ldHgGene -gtf dp3 genscan genscan.gtf hgPepPred dp3 generic genscanPep genscan.pep hgLoadBed dp3 genscanSubopt genscanSubopt.bed # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch dp3 genscanPep 0501071300.00 # PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 11/15/04 angie) # OK, for this we still need nibs so build those... ssh kksilo cd /cluster/data/dp3 mkdir nib foreach f (*/chr*.fa) faToNib -softMask $f nib/$f:t:r.nib end ssh kkr1u00 mkdir /iscratch/i/dp3 cp -pR /cluster/data/dp3/nib /iscratch/i/dp3/ iSync # MAKE DOWNLOADABLE FILES (DONE 11/16/04 angie) ssh kksilo mkdir /cluster/data/dp3/zips cd /cluster/data/dp3 # Oops, need to make hard-masked chr*.fa.masked before we can zip them: foreach f (*/chr*.fa) maskOutFa $f hard $f.masked end zip -j zips/chromOut.zip */chr*.fa.out zip -j zips/chromFa.zip */chr*.fa zip -j zips/chromFaMasked.zip */chr*.fa.masked cd bed/simpleRepeat zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed cd ../.. foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp3 cd /usr/local/apache/htdocs/goldenPath/dp3 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips cp -p /cluster/data/dp3/zips/*.zip . md5sum *.zip > md5sum.txt # SWAP DM1-DP3 BLASTZ (DONE 11/16/04 angie) ssh kksilo mkdir -p /cluster/data/dp3/bed/blastz.dm1.swap.2004-11-16/axtChrom ln -s blastz.dm1.swap.2004-11-16 /cluster/data/dp3/bed/blastz.dm1 cd /cluster/data/dp3/bed/blastz.dm1 set aliDir = /cluster/data/dm1/bed/blastz.dp3 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len mkdir unsorted axtChrom cat $aliDir/axtChrom/chr*.axt \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSplitByTarget stdin unsorted # Sort the shuffled .axt files. foreach f (unsorted/*.axt) echo sorting $f:t:r axtSort $f axtChrom/$f:t end du -sh $aliDir/axtChrom unsorted axtChrom #296M /cluster/data/dm1/bed/blastz.dp3/axtChrom #297M unsorted #296M axtChrom rm -r unsorted # CHAIN MELANOGASTER BLASTZ (DONE 11/17/04 angie) # Run axtChain on little cluster ssh kki cd /cluster/data/dp3/bed/blastz.dm1 mkdir -p axtChain/run1 cd axtChain/run1 mkdir chain ls -1S /cluster/data/dp3/bed/blastz.dm1/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh -ef axtChain -verbose=0 $1 \ /iscratch/i/dp3/nib \ /iscratch/i/dm1/nib stdout \ | chainAntiRepeat /iscratch/i/dp3/nib /iscratch/i/dm1/nib stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList para try, check, push, check... #Completed: 17 of 17 jobs #Average job time: 10s 0.17m 0.00h 0.00d #Longest job: 18s 0.30m 0.01h 0.00d #Submission to last job: 26s 0.43m 0.01h 0.00d # now on the fileserver, sort chains ssh kksilo cd /cluster/data/dp3/bed/blastz.dm1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain rm run1/chain/*.chain # Load chains into database ssh hgwdev cd /cluster/data/dp3/bed/blastz.dm1/axtChain/chain cat *.chain | hgLoadChain -tIndex dp3 chainDm1 stdin # NET MELANOGASTER BLASTZ (DONE 11/17/04 angie) ssh kksilo cd /cluster/data/dp3/bed/blastz.dm1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/dp3/bed/blastz.dm1/axtChain netClass -noAr noClass.net dp3 dm1 melanogaster.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/dp3/bed/blastz.dm1/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/dp3/bed/blastz.dm1/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet dp3 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net \ | hgLoadNet dp3 netSyntenyDm1 stdin # MAKE AXTNET (DONE 11/17/04 angie) ssh kksilo cd /cluster/data/dp3/bed/blastz.dm1/axtChain netSplit melanogaster.net net cd .. mkdir axtNet foreach f (axtChain/net/*) set chr = $f:t:r netToAxt $f axtChain/chain/$chr.chain /cluster/data/dp3/nib \ /cluster/data/dm1/nib stdout \ | axtSort stdin axtNet/$chr.axt end # MAKE VSDM1 DOWNLOADABLES (DONE 11/17/04 angie) ssh kksilo mkdir /cluster/data/dp3/zips/axtNet gzip -c \ /cluster/data/dp3/bed/blastz.dm1/axtChain/all.chain \ > /cluster/data/dp3/zips/melanogaster.chain.gz gzip -c \ /cluster/data/dp3/bed/blastz.dm1/axtChain/melanogaster.net \ > /cluster/data/dp3/zips/melanogaster.net.gz foreach f (/cluster/data/dp3/bed/blastz.dm1/axtNet/*.axt) gzip -c $f > /cluster/data/dp3/zips/axtNet/$f:t.gz end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/dp3/vsDm1 cd /usr/local/apache/htdocs/goldenPath/dp3/vsDm1 mv /cluster/data/dp3/zips/mel*.gz . mv /cluster/data/dp3/zips/axtNet . md5sum *.gz */*.gz > md5sum.txt # Make a README.txt which explains the files & formats. # MAKE 11.OOC FILE FOR BLAT (DONE 11/16/04 angie) # Use -repMatch=75 (based on size -- for human we use 1024, and # mosquito size is ~5% of human => 50 -- but I worry it doesn't scale # linearly so be conservative and require more matches to be "overused") ssh kkr1u00 mkdir /cluster/data/dp3/bed/ooc cd /cluster/data/dp3/bed/ooc ls -1 /cluster/data/dp3/nib/*.nib > nib.lst mkdir /cluster/bluearc/dp3 blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/dp3/11.ooc -repMatch=75 #Wrote 6790 overused 11-mers to /cluster/bluearc/dp3/11.ooc cp -p /cluster/bluearc/dp3/*.ooc /iscratch/i/dp3/ iSync # AUTO UPDATE GENBANK MRNA RUN (DONE 1/28/05 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # Edit etc/genbank.conf and add these lines: # dp3 (D. pseudoobscura) dp3.genome = /iscratch/i/dp3/nib/*.nib dp3.lift = /cluster/data/dp3/jkStuff/liftAll.lft dp3.refseq.mrna.native.load = no dp3.refseq.mrna.xeno.load = yes dp3.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005 dp3.genbank.mrna.xeno.load = yes dp3.genbank.est.xeno.load = no dp3.downloadDir = dp3 cvs ci etc/genbank.conf # Since D. pseudoobscura is not a new species for us, # don't need to edit src/lib/gbGenome.c. # Edit src/align/gbBlat to add /iscratch/i/dp3/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh eieio cd /cluster/data/genbank # This is an -initial run, (xeno) RefSeq only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dp3 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp3 featureBits dp3 xenoRefGene #14849904 bases of 134584124 (11.034%) in intersection # Clean up: rm -rf work/initial.dp3 # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dp3 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp3 featureBits dp3 mrna #28204 bases of 134584124 (0.021%) in intersection # Still pretty paltry, but enough so that it would be a shame to not # have them. featureBits dp3 xenoMrna #16380659 bases of 134584124 (12.171%) in intersection # Clean up: rm -rf work/initial.dp3 # ADDED 4/22/05 # This is an -initial run, EST only: nice bin/gbAlignStep -srcDb=genbank -type=est -initial dp3 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dp3 featureBits dp3 intronEst #154814 bases of 134584124 (0.115%) in intersection featureBits dp3 est #235099 bases of 134584124 (0.175%) in intersection # Clean up: rm -rf work/initial.dp3 # MAKE GCPERCENT (DONE 11/15/04 angie) ssh hgwdev mkdir /cluster/data/dp3/bed/gcPercent cd /cluster/data/dp3/bed/gcPercent # create and load gcPercent table hgGcPercent dp3 ../../nib # MAKE HGCENTRALTEST BLATSERVERS ENTRY (TODO 11/?/04 angie) ssh hgwdev echo 'insert into blatServers values("dp3", "blat?", "177??", 1, 0); \ insert into blatServers values("dp3", "blat?", "177??", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track (DONE braney 11/19/04) ssh kksilo mkdir -p /cluster/data/dp3/blastDb cd /cluster/data/dp3/blastDb ln -s ../*/*/*.fa . for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/dp3/blastDb cp /cluster/data/dp3/blastDb/* /iscratch/i/dp3/blastDb (iSync) 2>&1 > sync.out mkdir -p /cluster/data/dp3/bed/tblastn.dm1FB cd /cluster/data/dp3/bed/tblastn.dm1FB ls -1S /iscratch/i/dp3/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/dp3/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/434) = 54.206600 split -l 54 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/dp3/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 144699 of 144699 jobs # CPU time in finished jobs: 1808325s 30138.75m 502.31h 20.93d 0.057 y # IO & Wait Time: 550220s 9170.33m 152.84h 6.37d 0.017 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest job: 1069s 17.82m 0.30h 0.01d # Submission to last job: 25184s 419.73m 7.00h 0.29d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec # should run this on the mini-cluster or with my shove script # so you can limit the number of jobs starting to 3 or 4 para try, push... # Completed: 347 of 347 jobs # CPU time in finished jobs: 11953s 199.21m 3.32h 0.14d 0.000 y # IO & Wait Time: 4385s 73.09m 1.22h 0.05d 0.000 y # Average job time: 47s 0.78m 0.01h 0.00d # Longest job: 1500s 25.00m 0.42h 0.02d # Submission to last job: 2224s 37.07m 0.62h 0.03d exit # back to kksilo cd /cluster/data/dp3/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastDm1FB.psl ssh hgwdev cd /cluster/data/dp3/bed/tblastn.dm1FB hgLoadPsl dp3 blastDm1FB.psl # End tblastn # FLYBASE ANNOTATIONS (DONE 1/12/05 angie) ssh kksilo mkdir /cluster/data/dp3/bed/flybase cd /cluster/data/dp3/bed/flybase foreach f (../../?{,?}/chr*.fa) set c = `echo $f:t:r | sed -e 's/^chr//'` wget ftp://flybase.net/genomes/Drosophila_pseudoobscura/current/gff/dpse-$c-r1.03.gff.gz end zcat *.gff.gz > flybase.gff3 # What data sources are represented in this file? grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c # 9946 . CDS # 17 . chromosome # 2 . chromosome_arm # 120844 . exon # 12197 . gene # 15 . golden_path # 2650 . golden_path_fragment # 30310 . intron # 9946 . mRNA # 12101 . orthologous_region # 2905 . scaffold # 1170 . syntenic_region # 27838 blastn:na_dbEST.dpse match # 34576 blastz match # 18179 genewise gene # 18179 genewise mRNA # 67141 genewise match # 8327 genewise match_part # 17182 genscan gene # 17182 genscan mRNA # 46479 genscan match # 4847 genscan match_part # 18330 twinscan gene # 18330 twinscan mRNA # 45317 twinscan match # 4603 twinscan match_part # What keywords are defined in the 9th field? grep -v '^#' flybase.gff3 \ | awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \ | sort | uniq -c # 39402 Dbxref [gene, mRNA, CDS, intron only] # 548613 ID [everybody has this one.] # 76012 Name [gene, intron only] # 412944 Parent [gene, mRNA, CDS, exon, intron, match, match_part only] # 116103 object_oid [gene, match only] # 2666 species [chromosome*, golden_path* only] # 11493 to_name [orthologous_region only] # 13271 to_species [orthologous_region, syntenic_region only] # Supposedly this will be the stable standard, but for now I wrote # a one-shot perl script: extractGenes.pl flybase.gff3 # Get predicted proteins (for main annotations only) wget ftp://flybase.net/genomes/Drosophila_pseudoobscura/current/fasta/dpse-all-translation-r1.03.fasta.gz zcat dpse-all-translation-r1.03.fasta.gz \ | perl -wpe 's/^(>\w+)-PA/$1-RA/' > flybasePep.fa ssh hgwdev cd /cluster/data/dp3/bed/flybase ldHgGene -gtf dp3 flyBaseGene flyBase.gtf hgPepPred dp3 generic flyBasePep flybasePep.fa ldHgGene -gtf dp3 twinscan twinscan.gtf ldHgGene -gtf dp3 genewise genewise.gtf ldHgGene -gtf dp3 fbGenscan genscan.gtf featureBits dp3 -enrichment fbGenscan genscan #fbGenscan 18.578%, genscan 18.793%, both 17.662%, cover 95.07%, enrich 5.06x # Their genscan and ours are similar enough that we should drop one. # In featureBits -enrichment flyBaseGene, ours does a hair better so # drop FlyBase's. hgsql dp3 -e 'drop table fbGenscan' # Cross-referencing info for both coding and noncoding: hgsql dp3 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql hgsql dp3 -e 'load data local infile "flyBase2004Xref.tab" \ into table flyBase2004Xref' # SWAP CHAINS FROM DM2, BUILD NETS ETC. (DONE 5/23/05 angie) ssh kksilo mkdir /cluster/data/dp3/bed/blastz.dm2.swap cd /cluster/data/dp3/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.dp3/DEF \ -workhorse kkr7u00 >& do.log & tail -f do.log # Add {chain,net}Dm2 to trackDb.ra if necessary. # MAKE Drosophila Proteins track (DONE 06-30-05 braney) ssh kk mkdir -p /cluster/data/dp3/bed/tblastn.dm2FB cd /cluster/data/dp3/bed/tblastn.dm2FB ls -1S /iscratch/i/dp3/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs (didn't actually do this for dp3) calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\) # 18929/(164630/417) = 47.946261 split -l 50 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst rm -rf /cluster/bluearc/dp3/bed/tblastn.dm2FB/blastOut mkdir -p /cluster/bluearc/dp3/bed/tblastn.dm2FB/blastOut ln -s /cluster/bluearc/dp3/bed/tblastn.dm2FB/blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dp3/jkStuff/subChr.lft warn $f.2 liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dp3/jkStuff/liftAll.lft warn $f.3 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 target.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/dp3/bed/tblastn.dm2FB para create blastSpec para push # Completed: 143641 of 143641 jobs # CPU time in finished jobs: 1055248s 17587.47m 293.12h 12.21d 0.033 y # IO & Wait Time: 546039s 9100.65m 151.68h 6.32d 0.017 y # Average job time: 11s 0.19m 0.00h 0.00d # Longest finished job: 107s 1.78m 0.03h 0.00d # Submission to last job: 20041s 334.02m 5.57h 0.23d ssh kki cd /cluster/data/dp3/bed/tblastn.dm2FB tcsh cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=20000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec para push # Completed: 379 of 379 jobs # CPU time in finished jobs: 14757s 245.95m 4.10h 0.17d 0.000 y # IO & Wait Time: 9660s 161.00m 2.68h 0.11d 0.000 y # Average job time: 64s 1.07m 0.02h 0.00d # Longest finished job: 2472s 41.20m 0.69h 0.03d # Submission to last job: 4737s 78.95m 1.32h 0.05d ssh kkstore01 cd /cluster/data/dp3/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dp3/bed/tblastn.dm2FB/blastDm2FB.psl cd .. ssh hgwdev cd /cluster/data/dp3/bed/tblastn.dm2FB hgLoadPsl dp3 blastDm2FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie) ssh hgwdev mkdir /cluster/data/dp3/bed/geneid cd /cluster/data/dp3/bed/geneid foreach chr (`awk '{print $1;}' ../../chrom.sizes`) wget http://genome.imim.es/genepredictions/D.pseudoobscura/golden_path_200411/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt dp3 geneid *.gtf ########################################################################### # SWAP/CHAIN/NET DM3 (DONE 6/7/07 angie) ssh kkstore04 mkdir /cluster/data/dp3/bed/blastz.dm3.swap cd /cluster/data/dp3/bed/blastz.dm3.swap doBlastzChainNet.pl -swap /cluster/data/dm3/bed/blastz.dp3/DEF >& do.log & tail -f do.log ln -s blastz.dm3.swap /cluster/data/dp3/bed/blastz.dm3