# for emacs: -*- mode: sh; -*- # Apis mellifera -- # # Baylor HGSC's 20 July 2004 (Amel_1.2) assembly # http://www.hgsc.bcm.tmc.edu/projects/honeybee/ # # DOWNLOAD SEQUENCE (DONE 11/8/04 angie) ssh kksilo mkdir /cluster/store8/apiMel1 cd /cluster/data ln -s /cluster/store8/apiMel1 apiMel1 cd /cluster/data/apiMel1 mkdir jkStuff bed mkdir downloads cd downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20040120-freeze/README.TXT wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20040120-freeze/scaffold_contig_20040120.agp foreach g (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 Un) wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20040120-freeze/linearScaffolds/Group${g}_20040120.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20040120-freeze/linearScaffolds/Group${g}_20040120.fa.qual.gz end # Their fasta files contain some bogus empty records (">?50000" followed # by no sequence)... maybe meant to represent gaps in a concatenation of # these into chr*_randoms??). Filter those out. # Also strip "gnl|Amel_1.2|" from beginning of headers. zcat G*.fa.gz \ | egrep -v '^>\?[0-9]+$' \ | sed -e 's/gnl|Amel_1.2|//' \ > scaffolds.fa faSize scaffolds.fa #213394811 bases (4964694 N's 208430117 real) in 9751 sequences in 1 files #Total size: mean 21884.4 sd 86304.1 min 632 (GroupUn.6611) max 2167388 (Group6.15) median 2189 #N count: mean 509.1 sd 2060.5 # Strip "gnl|Amel_1.2|" from scaffold and contig names in AGP. sed -e 's/gnl|Amel_1.2|//g; s/Amel_1.2_Contig/Contig/' \ scaffold_contig_20040120.agp > scaffolds.agp # PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 11/8/04 angie) # Chop up large scaffolds into ~500kb chunks for RepeatMasker, # then glom the tiny scaffolds up into ~200k collections (looks like # some almost-500k pieces are glommed together --> some large chunks, # but that's OK). ssh kksilo cd /cluster/data/apiMel1 mkdir scaffoldsSplit faSplit size downloads/scaffolds.fa 500000 -oneFile \ scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft mkdir chunks500k faSplit about scaffoldsSplit.fa 200000 chunks500k/chunk_ # CREATING DATABASE (DONE 11/8/04 angie) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 641G 1019G 39% /var/lib/mysql hgsql '' -e 'create database apiMel1' # RUN REPEAT MASKER (DONE 11/8/04 angie) # January ("March") '04 version of RepeatMasker and libs. # Follow Ensembl's lead and run with drosophila libs. Also, just # for the heck of it, run with anopheles.lib too and concat results. ssh kksilo cd /cluster/data/apiMel1 # Next time, add -nolow to the -lib run to avoid duplicated effort/outputs. cat << '_EOF_' > jkStuff/RMApis #!/bin/csh -fe cd $1 /bin/mkdir -p /tmp/apiMel1/$2 /bin/cp ../chunks500k/$2 /tmp/apiMel1/$2/ pushd /tmp/apiMel1/$2 /cluster/bluearc/RepeatMasker/RepeatMasker -s -lib /cluster/bluearc/RepeatMasker/Libraries/anopheles.lib $2 mv $2.out $2.anopheles.out /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2 tail +4 $2.anopheles.out >> $2.out popd /bin/cp /tmp/apiMel1/$2/$2.out ./ /bin/rm -fr /tmp/apiMel1/$2/* /bin/rmdir --ignore-fail-on-non-empty /tmp/apiMel1/$2 /bin/rmdir --ignore-fail-on-non-empty /tmp/apiMel1 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMApis mkdir RMRun RMOut cp /dev/null RMRun/RMJobs foreach f ( `ls -1S chunks500k/*.fa` ) set chunk = $f:t echo ../jkStuff/RMApis \ /cluster/data/apiMel1/RMOut $chunk \ '{'check in line+ /cluster/data/apiMel1/$f'}' \ '{'check out line+ /cluster/data/apiMel1/RMOut/$chunk.out'}' \ >> RMRun/RMJobs end # do the run ssh kk cd /cluster/data/apiMel1/RMRun para create RMJobs para try, check, push, check,... #Completed: 682 of 682 jobs #Average job time: 5942s 99.03m 1.65h 0.07d #Longest job: 14569s 242.82m 4.05h 0.17d #Submission to last job: 14569s 242.82m 4.05h 0.17d # Lift up the split-scaffold .out's to scaffold .out's ssh kksilo cd /cluster/data/apiMel1 foreach f (RMOut/*.fa.out) liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null end # Make a consolidated scaffold .out file too: head -3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out foreach f (RMOut/*.scaf.out) tail +4 $f >> RMOut/scaffolds.fa.out end # Load the .out files into the database with: ssh hgwdev hgLoadOut apiMel1 /cluster/data/apiMel1/RMOut/scaffolds.fa.out # hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk, # but we want a non-split with no prefix table: hgsql apiMel1 -e 'rename table scaffolds_rmsk to rmsk' # Fix up the indices too: hgsql apiMel1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' # THIS APPLIES ONLY WHEN REPEATMASKER IS RUN MULTIPLE TIMES PER SEQUENCE: # 11/29/04: Need to remove duplicate Simple/Low Complexity repeats # caused by the two separate RepeatMasker invocations. Unfortunately # whitespace is different in the .out files, so sort & uniq the .tab: ssh hgwdev cd /cluster/data/apiMel1/RMOut hgLoadOut -tabFile=hgLoadOut.tab apiMel1 -nosplit scaffolds.fa.out sort -k 6,6 -k7n,7n hgLoadOut.tab | uniq > hgLoadOutUniq.tab hgsql apiMel1 -e 'delete from rmsk' hgsql apiMel1 -e 'load data local infile "hgLoadOutUniq.tab" into table rmsk' # LOAD GAP & GOLD TABLES FROM AGP (DONE 11/8/04 angie) # NOTE: RELOADED WITH FIX 12/17/04, SEE "MAKE CORRECTED SCAFFOLD AGP" BELOW ssh hgwdev hgGoldGapGl -noGl apiMel1 /cluster/data/apiMel1/downloads/scaffolds.agp # For some reason, the indices did not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". hgsql apiMel1 -e 'analyze table gold; analyze table gap;' # SIMPLE REPEATS (TRF) (DONE 11/8/04 angie) ssh kksilo mkdir /cluster/data/apiMel1/bed/simpleRepeat cd /cluster/data/apiMel1/bed/simpleRepeat nice trfBig -trf=/cluster/bin/i386/trf ../../downloads/scaffolds.fa \ /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \ |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \ > trf.log & # check on this with tail -f trf.log # Load this into the database as so ssh hgwdev hgLoadBed apiMel1 simpleRepeat \ /cluster/data/apiMel1/bed/simpleRepeat/simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/8/04 angie) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh kksilo cd /cluster/data/apiMel1/bed/simpleRepeat awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed # MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/8/04 angie) ssh kksilo cd /cluster/data/apiMel1 maskOutFa -soft downloads/scaffolds.fa bed/simpleRepeat/trfMask.bed \ scaffolds.fa maskOutFa -softAdd scaffolds.fa RMOut/scaffolds.fa.out scaffolds.fa # Now clean up the unmasked split scaffolds to avoid confusion later. rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 11/8/04 angie) # Translate to 2bit ssh kksilo cd /cluster/data/apiMel1 faToTwoBit scaffolds.fa apiMel1.2bit # Make chromInfo.tab. mkdir bed/chromInfo twoBitInfo apiMel1.2bit stdout \ | awk '{printf "%s\t%s\t/gbdb/apiMel1/apiMel1.2bit\n", $1, $2;}' \ > bed/chromInfo/chromInfo.tab # Make symbolic a link from /gbdb/apiMel1/ to the 2bit. ssh hgwdev mkdir -p /gbdb/apiMel1 ln -s /cluster/data/apiMel1/apiMel1.2bit /gbdb/apiMel1/ # Load chromInfo table. hgsql apiMel1 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql apiMel1 -e 'load data local infile \ "/cluster/data/apiMel1/bed/chromInfo/chromInfo.tab" into table chromInfo' # Make chrom.sizes from chromInfo contents and check scaffold count. hgsql apiMel1 -N -e 'select chrom,size from chromInfo' \ > /cluster/data/apiMel1/chrom.sizes wc -l /cluster/data/apiMel1/chrom.sizes # 9751 /cluster/data/apiMel1/chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING (DONE 11/8/04 angie) # Copy all the data from the table "grp" # in an existing database to the new database ssh hgwdev hgsql apiMel1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/8/04 angie) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("apiMel1", "Jul. 2004", "/gbdb/apiMel1", "A. mellifera", \ "GroupUn.5651:1001-6000", 1, 57, \ "A. mellifera", \ "Apis mellifera", "/gbdb/apiMel1/html/description.html", \ 0, 0, "Baylor HGSC Amel_1.2");' \ | hgsql -h genome-testdb hgcentraltest echo 'INSERT INTO defaultDb (genome, name) values ("A. mellifera", "apiMel1");' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvsup # Edit trackDb/makefile to add apiMel1 to the DBS variable. mkdir apis mkdir apis/apiMel1 # Create a simple apis/apiMel1/description.html file. cvs add apis cvs add apis/apiMel1 cvs add apis/apiMel1/description.html make update DBS=apiMel1 ZOO_DBS= # go public on genome-test cvs ci -m "Added apiMel1 (Apis mellifera, honeybee)." makefile cvs ci -m "First-cut trackDb.ra and description.html for apiMel1 (Apis mellifera, honeybee)." apis mkdir /gbdb/apiMel1/html # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 11/8/04 angie) # First, agglomerate small scaffolds into chunks of ~500k median # (many scaffolds are larger than that) so we don't have too many # files for one dir, but keep a reasonably low job run time: ssh kksilo cd /cluster/data/apiMel1 mkdir chunksUnsplit faSplit about scaffolds.fa 500000 chunksUnsplit/chunk_ ssh kkr1u00 mkdir /iscratch/i/apiMel1 cp -pR /cluster/data/apiMel1/chunksUnsplit /iscratch/i/apiMel1/ cp -p /cluster/data/apiMel1/apiMel1.2bit /iscratch/i/apiMel1/ iSync # PRODUCING GENSCAN PREDICTIONS (DONE 11/8/04 angie) ssh kksilo # Make hard-masked scaffolds and split up for processing: cd /cluster/data/apiMel1 maskOutFa scaffolds.fa hard scaffolds.fa.masked mkdir chunksUnsplitMasked faSplit about scaffolds.fa.masked 1000000 chunksUnsplitMasked/chunk_ mkdir /cluster/data/apiMel1/bed/genscan cd /cluster/data/apiMel1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list cat << '_EOF_' > gsub #LOOP gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << this line keeps emacs coloring happy gensub2 chunks.list single gsub jobList ssh kki cd /cluster/data/apiMel1/bed/genscan para create jobList para try, check, push, check, ... #Completed: 184 of 185 jobs #Crashed: 1 jobs #Average job time: 26s 0.43m 0.01h 0.00d #Longest job: 55s 0.92m 0.02h 0.00d #Submission to last job: 357s 5.95m 0.10h 0.00d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Wow, 1.2M and 1M crashed too. Dropped down to .6M: ssh kolossus cd /cluster/data/apiMel1/bed/genscan gsBig ../../chunksUnsplitMasked/chunk_60.fa gtf/chunk_60.gtf -trans=pep/chunk_60.pep -subopt=subopt/chunk_60.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=600000 # Concatenate scaffold-level results: ssh kksilo cd /cluster/data/apiMel1/bed/genscan cat gtf/*.gtf > genscan.gtf cat subopt/*.bed > genscanSubopt.bed cat pep/*.pep > genscan.pep # Clean up rm -r /cluster/data/apiMel1/chunksUnsplitMasked # Load into the database as so: ssh hgwdev cd /cluster/data/apiMel1/bed/genscan # Reloaded without -genePredExt 1/6/05: ldHgGene -gtf apiMel1 genscan genscan.gtf hgPepPred apiMel1 generic genscanPep genscan.pep hgLoadBed apiMel1 genscanSubopt genscanSubopt.bed # MYTOUCH FIX - jen - 2006-01-24 sudo mytouch apiMel1 genscanPep 0501071300.00 # MAKE DOWNLOADABLE FILES (DONE 11/8/04 angie) ssh kksilo mkdir /cluster/data/apiMel1/zips cd /cluster/data/apiMel1 zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out zip -j zips/scaffoldFa.zip scaffolds.fa zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked zip -j zips/scaffoldTrf.zip bed/simpleRepeat/trfMask.bed foreach f (zips/*.zip) echo $f unzip -t $f | tail -1 end ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/apiMel1 cd /usr/local/apache/htdocs/goldenPath/apiMel1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips cp -p /cluster/data/apiMel1/zips/*.zip . md5sum *.zip > md5sum.txt # SWAP DM1-APIMEL1 BLASTZ (DONE 11/8/04 angie) ssh kksilo mkdir /cluster/data/apiMel1/bed/blastz.dm1.swap.2004-11-08 ln -s blastz.dm1.swap.2004-11-08 /cluster/data/apiMel1/bed/blastz.dm1 cd /cluster/data/apiMel1/bed/blastz.dm1 set aliDir = /cluster/data/dm1/bed/blastz.apiMel1 cp $aliDir/S1.len S2.len cp $aliDir/S2.len S1.len # With 11k scaffolds, we don't want a directory with one file per # scaffold. So just make one .axt with everything -- not too huge # anyway, given these little insect genomes. zcat $aliDir/axtChrom/chr*.axt.gz \ | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \ | axtSort stdin dm1.axt # CHAIN MELANOGASTER BLASTZ (DONE 11/8/04 angie) # Run axtChain on kolossus (one big dm1.axt input) ssh kolossus mkdir /cluster/data/apiMel1/bed/blastz.dm1/axtChain cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \ -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \ -verbose=0 ../dm1.axt /cluster/data/apiMel1/apiMel1.2bit \ /cluster/data/dm1/nib stdout \ | chainAntiRepeat /cluster/data/apiMel1/apiMel1.2bit \ /cluster/data/dm1/nib stdin stdout \ | chainMergeSort stdin > all.chain # Load chains into database ssh hgwdev cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain hgLoadChain -tIndex apiMel1 chainDm1 all.chain # NET MELANOGASTER BLASTZ (DONE 11/9/04 angie) ssh kksilo cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain chainPreNet all.chain ../S1.len ../S2.len stdout \ | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \ | netSyntenic stdin noClass.net # Add classification info using db tables: ssh hgwdev cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain netClass -noAr noClass.net apiMel1 dm1 melanogaster.net # Make a 'syntenic' subset: ssh kksilo cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain rm noClass.net netFilter -syn melanogaster.net > melanogasterSyn.net # Load the nets into database ssh hgwdev cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain netFilter -minGap=10 melanogaster.net | hgLoadNet apiMel1 netDm1 stdin netFilter -minGap=10 melanogasterSyn.net \ | hgLoadNet apiMel1 netSyntenyDm1 stdin # MAKE AXTNET (DONE 11/9/04 angie) ssh kksilo cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain netToAxt melanogaster.net all.chain /cluster/data/apiMel1/apiMel1.2bit \ /cluster/data/dm1/nib stdout \ | axtSort stdin melanogasterNet.axt # MAKE VSDM1 DOWNLOADABLES (DONE 11/9/04 angie) ssh kksilo cd /cluster/data/apiMel1/bed/blastz.dm1/axtChain nice gzip *.{chain,net,axt} ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/apiMel1/vsDm1 cd /usr/local/apache/htdocs/goldenPath/apiMel1/vsDm1 cp -p /cluster/data/apiMel1/bed/blastz.dm1/axtChain/all.chain.gz \ melanogaster.chain.gz cp -p /cluster/data/apiMel1/bed/blastz.dm1/axtChain/melanogaster.net.gz . cp -p /cluster/data/apiMel1/bed/blastz.dm1/axtChain/melanogasterNet.axt.gz . # Make a README.txt which explains the files & formats. md5sum *.gz */*.gz > md5sum.txt # MAKE 11.OOC FILE FOR BLAT (DONE 11/8/04 angie) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). ssh kkr1u00 mkdir /cluster/bluearc/apiMel1 blat /cluster/data/apiMel1/apiMel1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/apiMel1/11.ooc -repMatch=100 #Wrote 9721 overused 11-mers to /cluster/bluearc/apiMel1/11.ooc cp -p /cluster/bluearc/apiMel1/*.ooc /iscratch/i/apiMel1/ iSync # GET GENBANK mRNA AND EST COUNTS (DONE 11/8/04 angie) # Go to the latest GenBank full release dir and get an idea of how # many mRNAs and ESTs there are to align. ssh eieio cd /cluster/data/genbank/data/processed/genbank.144.0/full awk '$4 == "Apis" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c #... # 194 Apis mellifera # OK, that's enough to justify a native mRNA track. (more than a # motivated researcher could send to hgBlat at once.) awk '$4 == "Apis" {print $4 " " $5;}' est*.gbidx | sort | uniq -c # 62 Apis cerana # 24643 Apis mellifera # Not so bad! # AUTO UPDATE GENBANK MRNA RUN (DONE 11/23/04 angie) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # Edit etc/genbank.conf and add these lines (note scaffold-browser settings): # apiMel1 (A. mellifera) apiMel1.genome = /iscratch/i/apiMel1/apiMel1.2bit apiMel1.mondoTwoBitParts = 1000 apiMel1.lift = no apiMel1.refseq.mrna.native.load = no apiMel1.genbank.mrna.xeno.load = yes apiMel1.genbank.est.xeno.load = no apiMel1.downloadDir = apiMel1 apiMel1.perChromTables = no cvs ci etc/genbank.conf # Since A. mellifera is a new species for us, edit src/lib/gbGenome.c. # Pick some other browser species, & monkey-see monkey-do. cvs diff src/lib/gbGenome.c make cvs ci src/lib/gbGenome.c # Edit src/align/gbBlat to add /iscratch/i/apiMel1/11.ooc cvs diff src/align/gbBlat make cvs ci src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial apiMel1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad apiMel1 featureBits apiMel1 all_mrna #157831 bases of 208434669 (0.076%) in intersection featureBits apiMel1 xenoMrna #7164827 bases of 208434669 (3.437%) in intersection # Clean up: rm -rf work/initial.apiMel1 # -initial for ESTs nice bin/gbAlignStep -srcDb=genbank -type=est -initial apiMel1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 apiMel1 & featureBits apiMel1 all_est #5563769 bases of 208434669 (2.669%) in intersection # Clean up: rm -rf work/initial.apiMel1 # MAKE GCPERCENT (REDONE 11/28/04 angie) # b0b found that the gcPercent table was empty -- don't know how that # happened! no errors & full table after rerunning. ssh hgwdev mkdir /cluster/data/apiMel1/bed/gcPercent cd /cluster/data/apiMel1/bed/gcPercent # create and load gcPercent table hgGcPercent apiMel1 /cluster/data/apiMel1 # MAKE HGCENTRALTEST BLATSERVERS ENTRY (TODO 11/?/04 angie) ssh hgwdev echo 'insert into blatServers values("apiMel1", "blat?", "177??", 1, 0); \ insert into blatServers values("apiMel1", "blat?", "177??", 0, 1);' \ | hgsql -h genome-testdb hgcentraltest # MAKE Drosophila Proteins track (DONE braney 11/17/04) ssh kksilo mkdir -p /cluster/data/apiMel1/blastDb cd /cluster/data/apiMel1/blastDb faSplit sequence ../scaffolds.fa 400 x for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/apiMel1/blastDb cp /cluster/data/apiMel1/blastDb/* /iscratch/i/apiMel1/blastDb (iSync) 2>&1 > sync.out mkdir -p /cluster/data/apiMel1/bed/tblastn.dm1FB cd /cluster/data/apiMel1/bed/tblastn.dm1FB ls -1S /iscratch/i/apiMel1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/apiMel1/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/396) = 49.460400 split -l 49 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2 mv $3.tmp $3 rm -f $f.1 $f.2 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/apiMel1/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 151668 of 151668 jobs # CPU time in finished jobs: 2212410s 36873.50m 614.56h 25.61d 0.070 y # IO & Wait Time: 704680s 11744.66m 195.74h 8.16d 0.022 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest job: 287s 4.78m 0.08h 0.00d # Submission to last job: 59559s 992.65m 16.54h 0.69d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec # should run this on the mini-cluster or with my shove script # so you can limit the number of jobs starting to 3 or 4 para try, push... # Completed: 383 of 383 jobs # CPU time in finished jobs: 189s 3.16m 0.05h 0.00d 0.000 y # IO & Wait Time: 7289s 121.48m 2.02h 0.08d 0.000 y # Average job time: 20s 0.33m 0.01h 0.00d # Longest job: 38s 0.63m 0.01h 0.00d # Submission to last job: 599s 9.98m 0.17h 0.01d exit # back to kksilo cd /cluster/data/apiMel1/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/apiMel1/bed/tblastn.dm1FB/blastDm1FB.psl ssh hgwdev cd /cluster/data/apiMel1/bed/tblastn.dm1FB hgLoadPsl apiMel1 blastDm1FB.psl # End tblastn # EXTRACT GAP INFO TO COMPARE WITH PROVIDED AGP (DONE 12/13/04 angie) # Doh! b0b found that gaps/contigs in the AGP don't always correspond # to N's/real bases! :( Run hgFakeAgp and get some whopper examples # to send to Baylor. ssh kksilo mkdir /cluster/data/apiMel1/bed/fakeAgp cd /cluster/data/apiMel1/bed/fakeAgp # AGP has gaps all the way down to 1 base... follow suit: hgFakeAgp -minContigGap=1 ../../downloads/scaffolds.fa fake.agp awk '$5 == "N" {printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $7}' fake.agp \ > extractedGaps.bed ssh hgwdev hgLoadBed apiMel1 extractedGaps \ /cluster/data/apiMel1/bed/fakeAgp/extractedGaps.bed # Used featureBits to intersect gap \!extractedGaps, extracted 118 # scaffolds with swapped coords --> apiMel1.diffGaps # MAKE CORRECTED SCAFFOLD AGP (DONE 12/17/04 angie) ssh kksilo cd /cluster/data/apiMel1/bed/fakeAgp # Made a one-shot perl script to reverse the AGP coords for the # 118 affected scaffolds. ./fixApiMel1Agp.pl ../../downloads/scaffolds.agp > scaffolds.fixed.agp # Reload gap & gold tables from fixed AGP: ssh hgwdev hgGoldGapGl -noGl apiMel1 \ /cluster/data/apiMel1/bed/fakeAgp/scaffolds.fixed.agp # Now make sure they jive with extractedGaps: featureBits apiMel1 gap \!extractedGaps #0 bases of 208434669 (0.000%) in intersection # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram) ssh hgwdev cd /tmp wget \ 'http://www.genome.gov/Pages/News/Photos/Science/honeybee_image.jpg' \ -O honeybee_image.jpg # view that image in 'display' to determine crop edges, then: convert -crop 1760x2000+240+480 -quality 80 -sharpen 0 \ -normalize honeybee_image.jpg aam.jpg convert -geometry 200x200 -quality 80 aam.jpg Apis_mellifera.jpg rm -f aam.jpg honeybee_image.jpg cp -p Apis_mellifera.jpg /usr/local/apache/htdocs/images # add links to this image in the description.html page, request push # SWAP DM2-APIMEL1 BLASTZ (DONE 1/31/05 angie) # OBSOLETED 1/13/06 angie -- see SWAP/CHAIN/NET DM2 below... # CHAIN MELANOGASTER BLASTZ (DONE 1/31/05 angie) # OBSOLETED 1/13/06 angie -- see SWAP/CHAIN/NET DM2 below... # NET MELANOGASTER BLASTZ (DONE 1/31/05 angie) # OBSOLETED 1/13/06 angie -- see SWAP/CHAIN/NET DM2 below... # MAKE AXTNET (DONE 1/31/05 angie) # OBSOLETED 1/13/06 angie -- see SWAP/CHAIN/NET DM2 below... # MAKE VSDM2 DOWNLOADABLES (DONE 2/2/05 angie) # OBSOLETED 1/13/06 angie -- see SWAP/CHAIN/NET DM2 below... # SWAP/CHAIN/NET DM2 (DONE 1/17/06 angie) # Get the latest file conventions... ssh hgwdev rm /cluster/data/apiMel1/bed/blastz.dm2 mkdir /cluster/data/apiMel1/bed/blastz.dm2.swap cd /cluster/data/apiMel1/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.apiMel1/DEF >& do.log & tail -f do.log ln -s blastz.dm2.swap /cluster/data/apiMel1/bed/blastz.dm2