# for emacs: -*- mode: sh; -*- # Apis mellifera -- # # Baylor HGSC's January 20, 2005 (Amel_2.0) assembly # http://www.hgsc.bcm.tmc.edu/projects/honeybee/ # # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +--------------+---------------------+ # | tableName | type | # +--------------+---------------------+ # | modelRefGene | genePred . | # | ensGene | genePred ensPep | # | brhInparalog | genePred . | # | geneid | genePred geneidPep | # | genscan | genePred genscanPep | # +--------------+---------------------+ # DOWNLOAD SEQUENCE (DONE 2/1/05 Andy) ssh kksilo mkdir /cluster/store10/apiMel2 cd /cluster/data ln -s /cluster/store10/apiMel2 apiMel2 cd /cluster/data/apiMel2 mkdir -p jkStuff bed downloads/fa downloads/contigs cd downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/README_Amel_2.0_20050120.TXT cd contigs for grp in `seq 1 16` Un; do file=Group${grp} wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/contigs/${file}_20050120.agp wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/contigs/${file}_20050120.fa.gz mv ${file}_20050120.agp $file.agp mv ${file}_20050120.fa.gz $file.fa.gz zcat $file.fa.gz | egrep -v '^>\?[0-9]+$' | sed -e \ 's/gnl|Amel_2.0|//; s/Amel_2.0_Contig/Contig/' | awk '{print $1}' > $file.contigs.fa cat $file.contigs.fa >> allContigs.fa sed -e 's/gnl|Amel_2.0|//g; s/Amel_2.0_Contig/Contig/' $file.agp > new.agp mv new.agp $file.agp agpToFa -simpleMultiMixed $file.agp $file $file.fa $file.contigs.fa cat $file.fa >> all.fa mv $file.fa ../fa gzip $file.contigs.fa done rm *.fa.gz faSize allContigs.fa #224752160 bases (1637 N's 224750523 real 224750523 upper 0 lower) in 16028 sequences in 1 files #Total size: mean 14022.5 sd 22045.0 min 607 (Contig11541) max 
314907 (Contig661) median 5126 #N count: mean 0.1 sd 0.8 #U count: mean 14022.4 sd 22044.9 #L count: mean 0.0 sd 0.0 faSize all.fa #502497597 bases (277747074 N's 224750523 real 224750523 upper 0 lower) in 17 seq uences in 1 files #Total size: mean 29558682.2 sd 75264208.4 min 5608962 (Group16) max 321143811 (G roupUn) median 10665743 #N count: mean 16338063.2 sd 60963796.3 #U count: mean 13220619.0 sd 14657764.0 #L count: mean 0.0 sd 0.0 # CREATING DATABASE (DONE 1/27/05 Andy) # Create the database. ssh hgwdev # Make sure there is at least 5 gig free for the database df -h /var/lib/mysql #/dev/sdc1 1.8T 641G 1019G 39% /var/lib/mysql hgsql '' -e 'create database apiMel2' # PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 2/2/05 Andy) ssh kksilo cd /cluster/data/apiMel2 mkdir faSplits for fa in downloads/fa/*.fa; do c=${fa#*Group}; c=Group${c%*.fa} mkdir faSplits/$c faSplit -lift=faSplits/$c.lft gap $fa 500000 faSplits/$c/${c}_ done # RUN REPEAT MASKER (DONE 2/3/05 Andy) # January '05 version of RepeatMasker and libs. ssh kksilo cd /cluster/data/apiMel2 # Check the library for apis repeats. /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species apis -stat #>IS1#ARTEFACT Length = 768 bp #>IS2#ARTEFACT Length = 1331 bp #>IS3#ARTEFACT Length = 1258 bp #>IS5#ARTEFACT Length = 1195 bp #>IS10#ARTEFACT Length = 1329 bp # ..... 
#>Mariner_AM#DNA/Mariner Length = 937 bp #>MARINA#DNA/Mariner Length = 1267 bp # #Total Sequence Length = 47828 bp # So there's honeybee in the library now, so just run repeat mask with that cat << "_EOF_" > jkStuff/RMApis #!/bin/bash file=`basename $1` grp=${file%_*} chunk=${file%.fa} cd /cluster/data/apiMel2/RMOut mkdir -p /tmp/apiMel2/$chunk cp ../faSplits/$grp/$file /tmp/apiMel2/$chunk/ pushd /tmp/apiMel2/$chunk /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec apis $file popd mkdir -p $grp cp /tmp/apiMel2/$chunk/$file.out ./$grp/$chunk.out rm -fr /tmp/apiMel2/$chunk/* rmdir --ignore-fail-on-non-empty /tmp/apiMel2/$chunk rmdir --ignore-fail-on-non-empty /tmp/apiMel2 _EOF_ # << this line makes emacs coloring happy chmod +x jkStuff/RMApis mkdir RMRun RMOut cd RMRun/ find ../faSplits/ -name '*.fa' > input.lst cat << "_EOF_" > gsub #LOOP ../jkStuff/RMApis $(path1) {check in line+ $(path1)} {check out line+ ../RMOut/$(lastDir1)/$(root1).out} #ENDLOOP _EOF_ gensub2 input.lst single gsub RMJobs # Do the cluster run ssh kk cd /cluster/data/apiMel2/RMRun para create RMJobs para try para push para time #1077 jobs in batch #44332 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs
#Completed: 1077 of 1077 jobs
#CPU time in finished jobs: 120949s 2015.82m 33.60h 1.40d 0.004 y
#IO & Wait Time: 5901s 98.34m 1.64h 0.07d 0.000 y
#Average job time: 118s 1.96m 0.03h 0.00d
#Longest job: 262s 4.37m 0.07h 0.00d
#Submission to last job: 2609s 43.48m 0.72h 0.03d

# lift and load results
ssh kksilo
cd /cluster/data/apiMel2/RMOut
for grp in *; do
    cd $grp
    for out in *; do
        lifted=${out%.out}.lifted.out
        liftUp $lifted ../../faSplits/$grp.lft warn $out
        if [ -e ../$grp.out ]; then
            tail +4 $lifted >> ../$grp.out
        else
            cp $lifted ../$grp.out
        fi
    done
    cd ../
    if [ -e all.out ]; then
        tail +4 $grp.out >> all.out
    else
        cp $grp.out all.out
    fi
done
ssh hgwdev
cd /cluster/data/apiMel2/RMOut
hgLoadOut apiMel2 all.out
hgsql apiMel2 -e 'rename table all_rmsk to rmsk'
hgsql apiMel2 -e 'drop index bin on rmsk; \
    drop index genoStart on rmsk; \
    drop index genoEnd on rmsk; \
    create index bin on rmsk (genoName(11), bin); \
    create index genoStart on rmsk (genoName(11), genoStart); \
    create index genoEnd on rmsk (genoName(11), genoEnd);'

# LOAD GAP & GOLD TABLES FROM AGP (DONE 2/3/2005 Andy)
ssh hgwdev
cd /cluster/data/apiMel2/downloads/contigs
cat Group?{,?}.agp >> all.agp
hgGoldGapGl -noGl apiMel2 all.agp
rm all.agp
# For some reason, the indices did not get built correctly --
# "show index from gap/gold" shows NULL cardinalities for chrom.
# Rebuild indices with "analyze table".
# *** Andy's note: the same thing happened in this assembly too.
hgsql apiMel2 -e 'analyze table gold; analyze table gap;'

# MAKE GC5BASE WIGGLE TRACK (DONE 2/17/2005 Andy)
ssh hgwdev
mkdir /cluster/data/apiMel2/bed/gc5Base
cd /cluster/data/apiMel2/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 apiMel2 \
    /cluster/data/apiMel2 | wigEncode stdin gc5Base.wig gc5Base.wib
mkdir /gbdb/apiMel2/wib
ln -s `pwd`/gc5Base.wib /gbdb/apiMel2/wib
hgLoadWiggle -pathPrefix=/gbdb/apiMel2/wib apiMel2 gc5Base gc5Base.wig

# SIMPLE REPEATS (TRF) (DONE 2/3/2005 Andy)
ssh kksilo
mkdir -p /cluster/data/apiMel2/bed/simpleRepeat
cd /cluster/data/apiMel2/bed/simpleRepeat
for f in ../../downloads/fa/*; do
    grp=${f##*\/}; grp=${grp%.fa}
    echo nice /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=simpleRepeat.$grp.bed -tempDir=/tmp >> jobs.sh
done
chmod +x jobs.sh
./jobs.sh | egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
    > jobs.log 2> jobs.errors.log &
# check on this with tail -f jobs.log
for bed in *.bed; do
    cat $bed >> simpleRepeat.bed
done
rm *Group*.bed
# Load this into the database as so
ssh hgwdev
cd /cluster/data/apiMel2/bed/simpleRepeat
hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \
    apiMel2 simpleRepeat simpleRepeat.bed

# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 2/3/2005 Andy)
# make a filtered version of the trf output:
# keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/apiMel2/bed/simpleRepeat
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 2/3/2005 Andy)
ssh kksilo
cd /cluster/data/apiMel2
mkdir maskedFa
cd maskedFa
cp ../downloads/fa/*.fa .
for fa in *; do grp=${fa%.fa} egrep "^${grp}\>" ../bed/simpleRepeat/trfMask.bed > $grp.trf.bed maskOutFa -soft $fa $grp.trf.bed $grp.masked.fa maskOutFa -softAdd $grp.masked.fa ../RMOut/$grp.out $grp.masked.fa rm $fa mv $grp.masked.fa $fa done rm *.bed # STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 2/4/05 Andy) # Translate to nib ssh kksilo cd /cluster/data/apiMel2 mkdir nib for fa in maskedFa/*; do nib=${fa##*\/}; nib=nib/${nib%.fa}.nib faToNib -softMask $fa $nib done faToTwoBit maskedFa/Group?{,?}.fa apiMel2.2bit # Make links in /gbdb ssh hgwdev mkdir -p /gbdb/apiMel2/nib cd /cluster/data/apiMel2/nib for nib in *; do ln -s /cluster/data/apiMel2/nib/$nib /gbdb/apiMel2/nib/$nib done ln -s /cluster/data/apiMel2/apiMel2.2bit /gbdb/apiMel2/apiMel2.2bit # Load into database hgsql apiMel2 < ~/kent/src/hg/lib/chromInfo.sql cd ../ hgNibSeq -preMadeNib apiMel2 /gbdb/apiMel2/nib /cluster/data/apiMel2/maskedFa/Group?{,?}.fa echo "select chrom,size from chromInfo" | hgsql apiMel2 | tail +2 \ > /cluster/data/apiMel2/chrom.sizes # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/4/2005 Andy) # Copy all the data from the table "grp" # in an existing database to the new database ssh hgwdev hgsql apiMel2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 2/4/2005 Andy) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("apiMel2", "Jan. 2005", "/gbdb/apiMel2", "A. mellifera", \ "Group10:100001-130000", 1, 57, \ "A. mellifera", \ "Apis mellifera", "/gbdb/apiMel2/html/description.html", \ 0, 0, "Baylor HGSC Amel_2.0");' \ | hgsql -N hgcentraltest echo 'update defaultDb set name="apiMel2" where genome="A. 
mellifera"' \
    | hgsql -N hgcentraltest

# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvs update
# Edit trackDb/makefile to add apiMel2 to the DBS variable.
mkdir apis/apiMel2
# Create a simple apis/apiMel2/description.html file.
cvs add apis/apiMel2
cvs add apis/apiMel2/description.html
make update DBS=apiMel2 ZOO_DBS=
# go public on genome-test
cvs ci -m "Added apiMel2 (Apis mellifera, honeybee)." makefile
cvs ci -m "First-cut trackDb.ra and description.html for apiMel2 (Apis mellifera, honeybee)." apis
mkdir /gbdb/apiMel2/html
# in a clean, updated tree's kent/src/hg/makeDb/trackDb:
make alpha

# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 1/29/2005 Andy)
ssh hgwdev
echo 'insert into blatServers values("apiMel2", "blat14", "17792", 1, 0); \
    insert into blatServers values("apiMel2", "blat14", "17793", 0, 1);' \
    | hgsql -N hgcentraltest

# MAKE DOWNLOADABLE FILES (DONE 2/6/2005 Andy)
ssh kksilo
cd /cluster/data/apiMel2
mkdir zips
zip -j zips/chromOut.zip RMOut/Group?{,?}.out
zip -j zips/chromFa.zip maskedFa/Group?{,?}.fa
zip -j zips/GroupTrf.zip bed/simpleRepeat/trfMask.bed
zip -j zips/GroupAgp.zip downloads/contigs/*.agp
for fa in maskedFa/Group?{,?}.fa; do
    maskOutFa $fa hard $fa.hardMasked
done
zip -j zips/chromFaMasked.zip maskedFa/Group?{,?}.fa.hardMasked
cd /usr/local/apache/htdocs/goldenPath/apiMel2
mkdir bigZips database
# Create README.txt files in bigZips/ and database/ to explain the files.
cp -p /cluster/data/apiMel2/zips/*.zip bigZips
md5sum *.zip > md5sum.txt

# MAKE 11.OOC FILE FOR BLAT (DONE 2/8/2005 Andy)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
ssh kkr1u00 mkdir /cluster/bluearc/apiMel2 blat /cluster/data/apiMel2/apiMel2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/apiMel2/11.ooc -repMatch=100 #Wrote 39886 overused 11-mers to /cluster/bluearc/apiMel2/11.ooc cp -p /cluster/bluearc/apiMel2/*.ooc /iscratch/i/apiMel2/ iSync # PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 2/8/2005 Andy) # First, agglomerate small scaffolds into chunks of ~500k median # (many scaffolds are larger than that) so we don't have too many # files for one dir, but keep a reasonably low job run time: ssh kksilo cd /cluster/data/apiMel2 ssh kkr1u00 cp -pR /cluster/data/apiMel2/faSplits /iscratch/i/apiMel2/ cp -p /cluster/data/apiMel2/apiMel2.2bit /iscratch/i/apiMel2/ cp -pR /cluster/data/apiMel2/nib /iscratch/i/apiMel2 iSync # PRODUCING GENSCAN PREDICTIONS (DONE 1/28/2005 Andy) ssh kksilo # Make hard-masked scaffolds and split up for processing: cd /cluster/data/apiMel2 mkdir hardMaskedFaSplits cd maskedFa/ for fa in *.hardMasked; do grp=${fa%.fa.hardMasked} faSplit -lift=../hardMaskedFaSplits/${grp}.lft gap $fa 1000000 ../hardMaskedFaSplits/${grp}_ done cd ../hardMaskedFaSplits cat Group?{,?}.lft > all.lft mkdir /cluster/data/apiMel2/bed/genscan cd /cluster/data/apiMel2/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt ls -1S ../../hardMaskedFaSplits/*.fa > chunks.list cat << _EOF_ > gsub #LOOP gsBig {check in line+ \$(path1)} {check out line gtf/\$(root1).gtf} -trans={check out line pep/\$(root1).pep} -subopt={check out line subopt/\$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP _EOF_ # << this line keeps emacs coloring happy gensub2 chunks.list single gsub jobList ssh kk cd /cluster/data/apiMel2/bed/genscan para create jobList para try para check para push #538 jobs in batch #0 jobs (including 
everybody's) in Parasol queue. #Checking finished jobs #Completed: 537 of 538 jobs #Crashed: 1 jobs #CPU time in finished jobs: 113471s 1891.18m 31.52h 1.31d 0.004 y #IO & Wait Time: 7324s 122.07m 2.03h 0.08d 0.000 y #Average job time: 225s 3.75m 0.06h 0.00d #Longest job: 32966s 549.43m 9.16h 0.38d #Submission to last job: 34674s 577.90m 9.63h 0.40d # If there are crashes, diagnose with "para problems". # If a job crashes due to genscan running out of memory, re-run it # manually with "-window=1200000" instead of "-window=2400000". # Wow, 1.2M and 1M crashed too. Dropped down to .6M: ssh kolossus cd /cluster/data/apiMel2/bed/genscan gsBig ../../hardMaskedFaSplits/Group9_05.fa gtf/Group9_05.gtf \ -trans=pep/Group9_05.pep -subopt=subopt/Group9_05.bed \ -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat \ -tmp=/tmp -window=600000 # Concatenate scaffold-level results: ssh kksilo cd /cluster/data/apiMel2/bed/genscan # Lift and cat things cat gtf/*.gtf > /tmp/genscan.gtf liftUp genscan.gtf ../../hardMaskedFaSplits/all.lft warn /tmp/genscan.gtf cat subopt/*.bed > /tmp/genscanSubopt.bed liftUp genscanSubopt.bed ../../hardMaskedFaSplits/all.lft warn /tmp/genscanSubopt.bed cat pep/*.pep > genscan.pep # Clean up rm -r /cluster/data/apiMel2/hardMaskedFaSplits # Change gene names around a little (errored on loading with this) #sed 's/\_[0-9Un]\{1,2\}//g' < genscan.gtf > /tmp/genscan.gtf #cp /tmp/genscan.gtf . #sed 's/\_[0-9Un]\{1,2\}//g' < genscan.pep > /tmp/genscan.pep #cp /tmp/genscan.pep . #sed 's/\_[0-9Un]\{1,2\}//g' < genscanSubopt.bed > /tmp/genscanSubopt.bed #cp /tmp/genscanSubopt.bed . 
# Load into the database as so: ssh hgwdev cd /cluster/data/apiMel2/bed/genscan ldHgGene -gtf apiMel2 genscan genscan.gtf hgPepPred apiMel2 generic genscanPep genscan.pep hgLoadBed apiMel2 genscanSubopt genscanSubopt.bed # GENBANK mRNA AND EST COUNTS (DONE 2/14/2005 Andy) # Go to the latest GenBank full release dir and get an idea of how # many mRNAs and ESTs there are to align. ssh eieio cd /cluster/data/genbank/data/processed/genbank.145.0/full awk '$4 == "Apis" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c # 199 Apis mellifera awk '$4 == "Apis" {print $4 " " $5;}' est*.gbidx | sort | uniq -c # 24643 Apis mellifera # AUTO UPDATE GENBANK MRNA RUN (DONE 2/8/2005 Andy) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . # apiMel2 (A. mellifera) apiMel2.genome = /iscratch/i/apiMel2/nib/*.nib apiMel2.lift = no apiMel2.refseq.mrna.native.load = no apiMel2.genbank.mrna.xeno.load = yes apiMel2.genbank.est.xeno.load = no apiMel2.downloadDir = apiMel2 apiMel2.perChromTables = no cvs commit -m 'apiMel2 added to genbank update.' 
etc/genbank.conf # Edit src/align/gbBlat to add /iscratch/i/apiMel2/11.ooc cvs commit -m '' src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial apiMel2 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad apiMel2 & featureBits apiMel2 all_mrna #171268 bases of 249601222 (0.069%) in intersection featureBits apiMel2 xenoMrna #7237381 bases of 249601222 (2.900%) in intersection # Clean up: rm -rf work/initial.apiMel2 # -initial for ESTs ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=est -initial apiMel2 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 apiMel2 & featureBits apiMel2 all_est #5809581 bases of 249601222 (2.328%) in intersection # Clean up: rm -rf work/initial.apiMel2 # SWAP DM2-APIMEL2 BLASTZ (DONE 2/9/2005 Andy) # OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below... # CHAIN MELANOGASTER BLASTZ (DONE 2/9/2005) # OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below... # NET MELANOGASTER BLASTZ (DONE 2/9/2005 Andy) # OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below... # MAKE AXTNET (DONE 2/10/2005 Andy) # OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below... # MAKE VSDM2 DOWNLOADABLES (DONE 2/10/2005 Andy) # OBSOLETED 1/13/06 angie, see SWAP/CHAIN/NET DM2 below... # MAKE Drosophila Proteins track (DONE 2005-02-24 braney) ssh kksilo mkdir -p /cluster/data/apiMel2/blastDb cd /cluster/data/apiMel2/blastDb ln -s /cluster/data/apiMel2/fa*/*/*.fa . 
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done rm *.fa *.log ssh kkr1u00 mkdir -p /iscratch/i/apiMel2/blastDb cd /cluster/data/apiMel2/blastDb cp * /iscratch/i/apiMel2/blastDb iSync > sync.out mkdir -p /cluster/data/apiMel2/bed/tblastn.dm1FB cd /cluster/data/apiMel2/bed/tblastn.dm1FB ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst exit # back to kksilo cd /cluster/data/apiMel2/bed/tblastn.dm1FB mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\) # 18735/(150000/1077) = 134.517300 split -l 135 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S fbfa/*.fa > fb.lst mkdir blastOut for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.3 mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.3 exit 1 '_EOF_' chmod +x blastSome gensub2 bug.lst fb.lst blastGsub blastSpec ssh kk cd /cluster/data/apiMel2/bed/tblastn.dm1FB para create blastSpec para try, push # Completed: 149703 of 149703 jobs # CPU time in finished jobs: 4747336s 79122.27m 1318.70h 54.95d 0.151 y # IO & Wait Time: 1196093s 19934.88m 332.25h 13.84d 0.038 y # Average job time: 
40s 0.66m 0.01h 0.00d # Longest job: 221s 3.68m 0.06h 0.00d # Submission to last job: 7580s 126.33m 2.11h 0.09d cat << '_EOF_' > chainGsub #LOOP chainSome $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainSome (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl) '_EOF_' chmod +x chainSome ls -1dS `pwd`/blastOut/fb?? > chain.lst gensub2 chain.lst single chainGsub chainSpec para create chainSpec # should run this on the mini-cluster or with my shove script # so you can limit the number of jobs starting to 3 or 4 para try, push... # Completed: 139 of 139 jobs # CPU time in finished jobs: 3259s 54.32m 0.91h 0.04d 0.000 y # IO & Wait Time: 3745s 62.41m 1.04h 0.04d 0.000 y # Average job time: 50s 0.84m 0.01h 0.00d # Longest job: 930s 15.50m 0.26h 0.01d # Submission to last job: 1580s 26.33m 0.44h 0.02d exit # back to kksilo cd /cluster/data/apiMel2/bed/tblastn.dm1FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/apiMel2/bed/tblastn.dm1FB/blastDm1FB.psl ssh hgwdev cd /cluster/data/apiMel2/bed/tblastn.dm1FB hgLoadPsl apiMel2 blastDm1FB.psl #once everything checks out, rm the temp dir rm -rf blastOut # End tblastn # MAKE SCAFFOLDS TRACK (DONE 2/23/2005 Andy) # NOTE: all intermediate files from this run were immediately cleaned up. # To see more details about the alignments, see the scaffolds2 directory # (RE-ALIGN SCAFFOLDS below). 
ssh kksilo cd /cluster/data/apiMel2/bed mkdir scaffolds cd scaffolds for g in `seq 1 16` Un; do wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/linearScaffolds/Group${g}_20050120.fa.gz; gunzip Group${g}_20050120.fa.gz nice blat -fastMap -maxIntron=0 -dots=1000000 ../../apiMel2.2bit:Group$g -ooc=../../11.ooc Group${g}_20050120.fa Group${g}.psl perfectBlatBed4 Group${g}.psl Group${g}.scaffolds.bed done # Fix up the Group10, and GroupUn ones by hand. Checking line counts for each bed file (Y1) with GroupX.Y2 last record, # Y1 should = Y2. GroupUn had two full-length hits to GroupUn.1669, so I deleted the one that didn't seem to jive. rm *.fa *.psl cat *.bed >> scaffolds.bed rm Group*.bed sed 's/gnl|.*|//' scaffolds.bed > tmp mv tmp scaffolds.bed hgLoadBed apiMel2 scaffolds scaffolds.bed # trackDb.ra entry: track scaffolds shortLabel Scaffolds longLabel Assembly from Scaffolds group map priority 9.5 visibility hide color 99,66,20 altColor 199,147,35 type bed 4 . # The description needs some editing. # MAKE Human Proteins track (hg17 DONE 2005-03-26 braney) ssh kolossus mkdir -p /cluster/data/apiMel2/blastDb cd /cluster/data/apiMel2/blastDb ln -s /cluster/data/apiMel2/faSplits/*/*.fa . 
for i in *.fa do formatdb -i $i -p F done rm *.log *.fa ssh kkr1u00 mkdir -p /iscratch/i/apiMel2/blastDb cd /cluster/data/apiMel2/blastDb cp * /iscratch/i/apiMel2/blastDb iSync > sync.out exit # back to kolossus mkdir -p /cluster/data/apiMel2/bed/tblastn.hg17KG cd /cluster/data/apiMel2/bed/tblastn.hg17KG ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > query.lst calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(350000/`wc query.lst | awk "{print \\\$1}"`\) # 42156/(350000/1077) = 129.720034 mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa split -l 130 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa/kg ln -s /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/kgfa kgfa cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut ln -s /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/iscratch/i/blast/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2 liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 exit 1 '_EOF_' chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec ssh kk cd 
/cluster/data/apiMel2/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 350025 of 350025 jobs
# CPU time in finished jobs: 18368399s 306139.99m 5102.33h 212.60d 0.582 y
# IO & Wait Time: 8819533s 146992.21m 2449.87h 102.08d 0.280 y
# Average job time: 78s 1.29m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5725s 95.42m 1.59h 0.07d
# Submission to last job: 37495s 624.92m 10.42h 0.43d

cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/apiMel2/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 325 of 325 jobs
# CPU time in finished jobs: 9334s 155.56m 2.59h 0.11d 0.000 y
# IO & Wait Time: 25507s 425.12m 7.09h 0.30d 0.001 y
# Average job time: 107s 1.79m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2999s 49.98m 0.83h 0.03d
# Submission to last job: 3888s 64.80m 1.08h 0.04d
exit
# back to kolossus
cd /cluster/data/apiMel2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
    awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/apiMel2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
ssh hgwdev
cd /cluster/data/apiMel2/bed/tblastn.hg17KG
hgLoadPsl apiMel2 blastHg17KG.psl
# back to kolossus
rm -rf blastOut
# End tblastn

# RE-ALIGN SCAFFOLDS (DONE 3/30/05 angie)
# Andy cleaned up so well after his scaffold alignment that all of the
# strand info is gone, but I need that to make the liftUp file below...
# would be good to include it in the track as well, if there are both
# + and - alignments.
ssh kolossus
mkdir /cluster/data/apiMel2/bed/scaffolds2
cd /cluster/data/apiMel2/bed/scaffolds2
foreach g (`seq 1 16` Un)
    wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Amellifera/fasta/Amel20050120-freeze/linearScaffolds/Group${g}_20050120.fa.gz
end
faSize *.fa.gz
#228567597 bases (3817074 N's 224750523 real 224750523 upper 0 lower) in 15293 sequences in 17 files
#Total size: mean 14945.9 sd 83937.6 min 0 (?35000) max 2209875 (gnl|Amel_2.0|Group12.11) median 771
# Have to uncompress for blat.
gunzip *.gz
ssh kki
cd /cluster/data/apiMel2/bed/scaffolds2
mkdir psl
ls -1S *.fa | sed -e 's/_20050120.fa//' > group.lst
# NOTE(review): the doc is garbled at this point -- the gsub here-doc body,
# the cluster blat run, and the start of the pslReps filtering commentary
# were lost in transcription (only the fragment "... edit out the former."
# survives). Reconstruct the actual commands from the psl/ and pslReps/
# dirs under scaffolds2 if they are ever needed again.
# GroupUn.5526 and .1699 have imperfect mappings that slipped in
# under the pslReps wire -- edit out the ones with multiple blocks.
# Having whittled down the psl to one perfect alignment per scaffold,
# make bed 6 to preserve the strand info.
awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $14, $16, $17, $10, $9;}' \
    pslReps/*.psl \
    | sed 's/gnl|Amel_2.0|//' > scaffolds.bed
# Compare coordinates to Andy's bed4 from previous run:
sort ../scaffolds/scaffolds.bed > /tmp/1
awk '{print $1 "\t" $2 "\t" $3 "\t" $4;}' scaffolds.bed | sort > /tmp/2
diff /tmp/1 /tmp/2 | wc -l
#0
# OK, now after all that fuss, are there any - strand alignments?
awk '{print $6;}' scaffolds.bed | sort | uniq
#+
# No! So no need to reload the scaffolds track. However, we can
# proceed with confidence to make a strandless liftUp file.
# (liftUp doesn't support all formats when there are - strand items) # Cleanup: gzip *.fa psl/*.psl rm *.tab # MAKE SCAFFOLDS -> GROUPS LIFTUP FILE (DONE 3/30/05 angie) ssh kolossus cd /cluster/data/apiMel2/bed/scaffolds2 cp /dev/null ../../jkStuff/scaffolds.lft foreach g (`awk '{print $1;}' ../../chrom.sizes`) set size = `awk '$1 == "'$g'" {print $2;}' ../../chrom.sizes` echo $g $size awk '$1 == "'$g'" {printf "%d\t%s\t%d\t%s\t%d\n", $2, $4, ($3 - $2), $1, '$size';}' \ scaffolds.bed >> ../../jkStuff/scaffolds.lft end wc -l scaffolds.bed ../../jkStuff/scaffolds.lft # 7655 scaffolds.bed # 7655 ../../jkStuff/scaffolds.lft # HOMOLOGY GENE PREDICTIONS FROM ZDOBNOV (DONE 5/13/05 angie) # Updated data -- GFF loads cleanly now, and is split into high- # and low-confidence sets. ssh hgwdev mkdir /cluster/data/apiMel2/bed/brhInparalog cd /cluster/data/apiMel2/bed/brhInparalog wget http://azra.embl-heidelberg.de/~zdobnov/Bee2/amel_set1.gff.txt wget http://azra.embl-heidelberg.de/~zdobnov/Bee2/amel_set2.gff.txt # Discard header line, change "exon" to "CDS", and lift up from # scaffolds to group coords: tail +2 amel_set1.gff.txt | sed -e 's/exon/CDS/' \ | liftUp brhInparalog.gff ../../jkStuff/scaffolds.lft warn stdin tail +2 amel_set2.gff.txt | sed -e 's/exon/CDS/' \ | liftUp brhInparalogLow.gff ../../jkStuff/scaffolds.lft warn stdin ldHgGene apiMel2 brhInparalog brhInparalog.gff ldHgGene apiMel2 brhInparalogLow brhInparalogLow.gff # ENSEMBL GENES (DONE 4/12/05 angie) ssh hgwdev mkdir /cluster/data/apiMel2/bed/ensGene cd /cluster/data/apiMel2/bed/ensGene wget ftp://ftp.ensembl.org/pub/misc/vvi/ensembl_genes_8Apr05.tar.gz wget ftp://ftp.ensembl.org/pub/misc/vvi/ensembl_translations8Apr05.fa.gz tar xvzf ensembl_genes_8Apr05.tar.gz zcat ensembl_translations8Apr05.fa.gz \ | perl -wpe 's/^>\S+ Translation id (ENSAPMT\d+) .*/>$1/' \ > ensPep.fa ldHgGene -gtf -genePredExt apiMel2 ensGene gtf_dump* hgPepPred apiMel2 generic ensPep ensPep.fa # NCBI ANNOTATIONS (XM_* PREDICTED REFSEQS) 
(REDONE 5/26/05 angie) # Originally done 5/17; reloaded w/new data from Barbara Ruef @ncbi 5/26. ssh hgwdev mkdir /cluster/data/apiMel2/bed/ncbi cd /cluster/data/apiMel2/bed/ncbi wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/Amel2.0_Scaffolds_NCBI_BCM.map wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI.gff.gz wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI-mrna.fa.gz wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Apis_mellifera/special_requests/NCBI-prot.fa.gz # NCBI has their own scaffold IDs... make a liftUp spec file from # scaffolds.lft and their mapping. perl -e 'open(MAP, "Amel2.0_Scaffolds_NCBI_BCM.map") || die doh; \ %b2n = (); \ while () { \ chomp; my ($ncbi, undef, $baylor) = split; \ $b2n{$baylor} = $ncbi; \ } \ while (<>) { \ my @words = split; \ next if (! exists $b2n{$words[1]}); \ $words[1] = $b2n{$words[1]}; \ print join("\t", @words) . "\n"; \ }' ../../jkStuff/scaffolds.lft > ../../jkStuff/ncbiScaffolds.lft liftUp -type=.gff apiMel2.RefSeq.gff ../../jkStuff/ncbiScaffolds.lft warn \ NCBI.gff.gz # Some warnings about NC_001566.1, the mitochondrion, which wasn't # included in apiMel2 -- OK. # At first I wasn't sure if we wanted to load this up, so I made it # available as a custom track file # http://genome-test.cse.ucsc.edu/angie/apiMel2.RefSeq.gff ln -s /cluster/data/apiMel2/bed/ncbi/apiMel2.RefSeq.gff \ /usr/local/apache/htdocs/angie/ # But Jim said that we should show these, so load it up: ldHgGene -gtf -genePredExt apiMel2 modelRefGene apiMel2.RefSeq.gff # The proteins XP_* don't seem to correspond to the XM_* genes with # the same numbers, so don't load them up -- the predicted proteins # are straight from genomic sequence anyway. 
# EXONERATE TRACK (IN PROGRESS 5/23/2005 Andy) ssh kk mkdir /cluster/data/apiMel2/bed/exonerateDm2 cd /cluster/data/apiMel2/bed/exonerateDm2 cat << EOF > exonerate.sh #!/bin/bash dirGroup=\`dirname \$2\` group=\`basename \$dirGroup\` gSplit=\`basename \$2 .fa\` pSplit=\`basename \$1 .fa\` outFile=\${gSplit}_\${pSplit}.gff alias exonerate=/cluster/home/aamp/bin/i386/exonerate exonerate --model protein2genome --percent 50 --showvulgar false --showalignment false --showtargetgff true \$1 \$2 | grep -v "^Command" | grep -v "^Hostname" | grep -v "^#" | grep -v "^--" > /tmp/\$outFile mkdir -p /cluster/bluearc/apiMel2/exonerateOutput/\$group cp /tmp/\$outFile \$3 rm /tmp/\$outFile EOF find /iscratch/i/apiMel2/faSplits/ -name '*.fa' > honey.lst mkdir /santest/scratch/dm2 pushd /santest/scratch/dm2 cp /cluster/data/dm2/bed/flybase/flybasePep.fa . faSplit sequence flybasePep.fa 25 fly_ rm flybasePep.fa popd find /santest/scratch/dm2 -name '*.fa' > fly.lst cat << EOF > gsub #LOOP ./exonerate.sh {check in line+ \$(path2)} {check in line+ \$(path1)} {check out exists /cluster/bluearc/apiMel2/exonerateOutput/\$(lastDir1)/\$(root1)_\$(root2).gff} #ENDLOOP EOF gensub2 honey.lst fly.lst gsub spec para create spec para try para check #19386 jobs in batch #0 jobs (including everybody's) in Parasol queue. 
#Checking finished jobs #Completed: 19386 of 19386 jobs #CPU time in finished jobs: 1349378s 22489.63m 374.83h 15.62d 0.043 y #IO & Wait Time: 192915s 3215.25m 53.59h 2.23d 0.006 y #Average job time: 80s 1.33m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 7474s 124.57m 2.08h 0.09d #Submission to last job: 14456s 240.93m 4.02h 0.17d ssh hgwdev cd /cluster/data/apiMel2/bed/exonerateDm2 find /panasas/store/apiMel2/exonerateOutput -name "*.gff" -size +0b -exec cat '{}' ';' > dm2.gff ssh kk cd /cluster/data/apiMel2/bed/exonerateDm2 cat /iscratch/i/apiMel2/faSplits/*.lft > liftAll.lft ssh hgwdev cd /cluster/data/apiMel2/bed/exonerateDm2 liftUp fly.gff liftAll.lft warn dm2.gff ldHgGene apiMel2 exonerateDm2 fly.gff #Reading fly.gff #Read 0 transcripts in 52886 lines in 1 files # 0 groups 17 seqs 1 sources 6 feature types #0 gene predictions # Yeah that's no good. # MAKE Drosophila Proteins track (DONE 06-30-05 braney) ssh kk mkdir -p /cluster/data/apiMel2/bed/tblastn.dm2FB cd /cluster/data/apiMel2/bed/tblastn.dm2FB ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir fbfa # calculate a reasonable number of jobs calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(164630/`wc target.lst | awk "{print \\\$1}"`\) # 18929/(164630/1077) = 123.832430 split -l 124 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb cd fbfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/apiMel2/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
# Parasol template: one tblastn job per (target split, query split).
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# Per-job script: retries blastall over progressively stricter e-values
# until one run succeeds, converts the blast report to psl, then lifts
# target (contig -> scaffold) and query (protein) coordinates.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst fb.lst blastGsub blastSpec
ssh kk
cd /cluster/data/apiMel2/bed/tblastn.dm2FB
para create blastSpec
para push
# Completed: 164781 of 164781 jobs
# CPU time in finished jobs: 2133855s 35564.25m 592.74h 24.70d 0.068 y
# IO & Wait Time: 1168861s 19481.02m 324.68h 13.53d 0.037 y
# Average job time: 20s 0.33m 0.01h 0.00d
# Longest finished job: 276s 4.60m 0.08h 0.00d
# Submission to last job: 39379s 656.32m 10.94h 0.46d
exit
# back to kksilo
# Chain the per-query psl hits within each blastOut subdirectory.
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? \
> chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/apiMel2/bed/tblastn.dm2FB para create chainSpec para push # Completed: 153 of 153 jobs # CPU time in finished jobs: 8279s 137.98m 2.30h 0.10d 0.000 y # IO & Wait Time: 4933s 82.22m 1.37h 0.06d 0.000 y # Average job time: 86s 1.44m 0.02h 0.00d # Longest finished job: 3445s 57.42m 0.96h 0.04d # Submission to last job: 3823s 63.72m 1.06h 0.04d cd /cluster/data/apiMel2/bed/tblastn.dm2FB/blastOut for i in fb?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/apiMel2/bed/tblastn.dm2FB/blastDm2FB.psl cd .. ssh hgwdev cd /cluster/data/apiMel2/bed/tblastn.dm2FB hgLoadPsl apiMel2 blastDm2FB.psl exit # back to kksilo rm -rf blastOut # End tblastn # MAKE Human Proteins track (DONE 06-30-05 braney) mkdir -p /cluster/data/apiMel2/bed/tblastn.hg17KG cd /cluster/data/apiMel2/bed/tblastn.hg17KG ls -1S /iscratch/i/apiMel2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst mkdir kgfa # calculate a reasonable number of jobs calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\) # 37365/(150000/1077) = 268.280700 split -l 268 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg cd kgfa for i in *; do pslxToFa $i $i.fa; rm $i; done cd .. 
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/apiMel2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# Per-job script, same retry-over-e-values scheme as the dm2FB run.
# NOTE(review): unlike the dm2FB blastSome, neither exit path here
# removes $f.3, so the lifted temp psl is left behind in /tmp.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/apiMel2/jkStuff/liftAll.lft warn $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/apiMel2/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 150780 of 150780 jobs
# CPU time in finished jobs: 14906565s 248442.75m 4140.71h 172.53d 0.473 y
# IO & Wait Time: 3654875s 60914.58m 1015.24h 42.30d 0.116 y
# Average job time: 123s 2.05m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 610s 10.17m 0.17h 0.01d
# Submission to last job: 122371s 2039.52m 33.99h 1.42d
# Chain the per-query psl hits within each blastOut subdirectory.
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? \
> chain.lst gensub2 chain.lst single chainGsub chainSpec ssh kki cd /cluster/data/apiMel2/bed/tblastn.hg17KG para create chainSpec para push # Completed: 140 of 140 jobs # CPU time in finished jobs: 5416s 90.27m 1.50h 0.06d 0.000 y # IO & Wait Time: 11122s 185.37m 3.09h 0.13d 0.000 y # Average job time: 118s 1.97m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1427s 23.78m 0.40h 0.02d # Submission to last job: 1696s 28.27m 0.47h 0.02d ssh kkstore01 cd /cluster/data/apiMel2/bed/tblastn.hg17KG/blastOut for i in kg?? do awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/apiMel2/bed/tblastn.hg17KG/blastHg17KG.psl cd .. ssh hgwdev cd /cluster/data/apiMel2/bed/tblastn.hg17KG hgLoadPsl apiMel2 blastHg17KG.psl exit # back to kksilo rm -rf blastOut # End tblastn # BEEBASE GENE MODELS (DONE 7/27/05 angie) mkdir /cluster/data/apiMel2/bed/beeBase cd /cluster/data/apiMel2/bed/beeBase set bbpassword = # emailed from Chris Elsik at TAMU wget ftp://guest:$bbpassword@trixie.tamu.edu/official_gene_sets/Amel_abinitio.gff wget ftp://guest:$bbpassword@trixie.tamu.edu/official_gene_sets/Amel_predicted_glean3.gff sed -e 's/gene_id //' Amel_abinitio.gff \ | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \ | ldHgGene apiMel2 beeBaseAbinitio stdin perl -wpe 's/GenePrediction (\w+-\w+).*/$1/' Amel_predicted_glean3.gff \ | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \ | ldHgGene apiMel2 beeBaseGlean3 stdin # Make a custom track for Hugh Robertson and Chris to use in the meantime # (until they can send more complete descriptions). 
echo 'track name=beeBaseGlean3 description="BeeBase GLEAN3 Gene Models" color=175,175,0 grp=genes priority=41 visibility=pack' > beeBaseCustomTracks.txt perl -wpe 's/GenePrediction (\w+-\w+).*/$1/' Amel_predicted_glean3.gff \ | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \ >> beeBaseCustomTracks.txt echo 'track name=beeBaseAbinitio description="BeeBase Ab Initio Gene Models" color=150,120,10 grp=genes priority=42 visibility=pack' >> beeBaseCustomTracks.txt sed -e 's/gene_id //' Amel_abinitio.gff \ | liftUp -type=.gff stdout ../../jkStuff/scaffolds.lft warn stdin \ >> beeBaseCustomTracks.txt cp beeBaseCustomTracks.txt /usr/local/apache/htdocs/angie/ # GENEID PREDICTIONS (DONE 1/4/06 angie) ssh hgwdev mkdir /cluster/data/apiMel2/bed/geneid cd /cluster/data/apiMel2/bed/geneid foreach chr (`awk '{print $1;}' /cluster/data/apiMel2/chrom.sizes`) wget http://genome.imim.es/genepredictions/A.mellifera/golden_path_200501/geneidv1.2/$chr.gtf end ldHgGene -gtf -genePredExt apiMel2 geneid *.gtf # SWAP/CHAIN/NET DM2 (DONE 1/13/06 angie) # dm2-apiMel2 was redone with better parameters in May '05... # pick those up and also get the latest file conventions. ssh hgwdev rm /cluster/data/apiMel2/bed/blastz.dm2 mkdir /cluster/data/apiMel2/bed/blastz.dm2.swap cd /cluster/data/apiMel2/bed/blastz.dm2.swap doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.apiMel2/DEF >& do.log & tail -f do.log ln -s blastz.dm2.swap /cluster/data/apiMel2/bed/blastz.dm2