# for emacs: -*- mode: sh; -*- # # RED FLOUR BEETLE!!! # Tribolium castaneum -- 2.0 assembly Oct 10, 2005 # # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/ # GET SEQUENCE / SET UP DIRS (DONE Andy 10/22/2005) ssh hgwdev cd /cluster/store9 mkdir triCas2 cd /cluster/data ln -s /cluster/store9/triCas2 triCas2 cd triCas2 mkdir downloads bed cd downloads/ wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/Tcas2.0/contigs/Tcas20050914-contigs wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/Tcas2.0/contigs/Tcas20050914.agp sed 's/#.*//' Tcas20050914.agp > beetle.agp sed 's/^>gi|[0-9]\+|gb|\([0-9A-Z]\+\)\.1|.*/>\1/' Tcas20050914-contigs > contigs.fa agpAllToFaFile beetle.agp contigs.fa beetle.fa faSize beetle.fa #199682416 bases (48132453 N's 151549963 real 151549963 upper 0 lower) in 2211 sequences in 1 files #Total size: mean 90313.2 sd 1137291.7 min 248 (singleUn_1093) max 32080666 (ChLG3) median 1526 #N count: mean 21769.5 sd 327170.2 #U count: mean 68543.6 sd 816613.0 #L count: mean 0.0 sd 0.0 # CREATING DATABASE (DONE 10/22/2005 Andy) ssh hgwdev hgsql '' -e 'create database triCas2' # LOAD GAP & GOLD TABLES FROM AGP (DONE 10/22/2005 Andy) ssh hgwdev cd /cluster/data/triCas2/downloads hgGoldGapGl -noGl triCas2 beetle.agp hgsql triCas2 -e 'analyze table gold; analyze table gap;' # REPEATMASKER (DONE 10/22/2005 Andy) ssh pk mkdir -p /SAN/rmsk/{splits,run,out} cd /SAN faSplit -outDirDepth=1 -lift=500kSplits/splits.lft gap /cluster/data/triCas2/downloads/beetle.fa 500000 500kSplits/beet_ cd rmsk/ ln -s ../500kSplits splits cd run/ cat > rmsk.sh << "EOF" #!/bin/bash fa=`basename $1` param=-spec Tenebrionoidea pushd /scratch oldDir=`dirs +1` cp $oldDir/$1 . /cluster/bluearc/RepeatMasker/RepeatMasker $param $fa mkdir -p $oldDir/`dirname $2` cp $fa.out $oldDir/$2 rm $fa* popd EOF chmod +x rmsk.sh cat > gsub << "EOF" #LOOP ./rmsk.sh {check in line+ $(path1)} {check out exists ../out/$(lastDir)/$(file1).out} #ENDLOOP EOF find ../splits/ -name '*.fa' > fa.lst gensub2 fa.lst single gsub spec para create spec para try para push para time #Completed: 2572 of 2572 jobs #CPU time in finished jobs: 610301s 10171.69m 169.53h 7.06d 0.019 y #IO & Wait Time: 61073s 1017.88m 16.96h 0.71d 0.002 y #Average job time: 261s 4.35m 0.07h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 3063s 51.05m 0.85h 0.04d #Submission to last job: 6465s 107.75m 1.80h 0.07d cd ../ head -n 3 out/0/beet_010.fa.out > head find out/ -name "*.fa.out" -type f -size +80c -exec tail +4 '{}' ';' > tmp cat head tmp > tmp2 liftUp rmsk.out splits/splits.lft warn tmp2 rm head tmp tmp2 ssh hgwdev cd /SAN/rmsk hgLoadOut triCas2 rmsk.out hgsql triCas2 -e 'rename table rmsk_rmsk to rmsk' hgsql triCas2 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' rm splits cd ../ cp -r rmsk/ /cluster/data/triCas2/bed # SIMPLE REPEATS (TRF) (DONE 10/22/2005 Andy) ssh pk cd /SAN mkdir -p simpleRepeat/{run,out} cd simpleRepeat/ ln -s ../500kSplits splits cd run/ cat << "_EOF_" > gsub #LOOP ./trfBig.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDir1)/$(root1).bed} #ENDLOOP _EOF_ cat << "_EOF_" > trfBig.sh #!/bin/bash outDir=`dirname $2` outName=`basename $2` inName=`basename $1` mkdir -p $outDir cp $1 /scratch pushd /scratch trfBig $inName $outName -bed -tempDir=/tmp > /dev/null popd cp /scratch/$outName $2 rm /scratch/$outName /scratch/$inName _EOF_ chmod +x trfBig.sh find ../splits/ -name '*.fa' > fa.lst gensub2 fa.lst single gsub spec para create spec para try para push para time #Completed: 2572 of 2572 jobs #CPU time in finished jobs: 2137s 35.62m 0.59h 0.02d 0.000 y #IO & Wait Time: 6493s 108.22m 1.80h 0.08d 0.000 y #Average job time: 3s 0.06m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 475s 7.92m 0.13h 0.01d #Submission to last job: 908s 15.13m 0.25h 0.01d cd ../ find out/ -name "*.bed" -type f -exec grep "^beet" '{}' ';' > tmp liftUp trf.bed splits/splits.lft warn tmp rm tmp ssh hgwdev cd SAN/simpleRepeat/ hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \ triCas2 simpleRepeat trf.bed rm splits cd ../ cp -r simpleRepeat/ /cluster/data/triCas2/bed # MAKE MASKED 2BIT (DONE 10/22/2005 Andy) # make a filtered version of the trf output keep trf's with period <= 12: ssh hgwdev cd /cluster/data/triCas2 awk '{if ($5 <= 12) print;}' bed/simpleRepeat/trf.bed > trfMask.bed maskOutFa -soft downloads/beetle.fa trfMask.bed tmp maskOutFa -softAdd tmp bed/rmsk/rmsk.out beetle.masked.fa faToTwoBit beetle.masked.fa triCas2.2bit hgsql triCas2 < ~/kent/src/hg/lib/chromInfo.sql twoBitInfo triCas2.2bit /dev/stdout | awk '{printf("%s\t%s\t/gbdb/triCas2/triCas2.2bit\n", $1, $2)}' > chrom.sizes echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql triCas2 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/22/2005 Andy) # Copy all the data from the table "grp" in an existing database to the new database ssh hgwdev hgsql triCas2 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # SOME GBDB STUFF (DONE 10/22/2005 Andy) ssh hgwdev mkdir -p /gbdb/triCas2 mkdir /cluster/data/triCas2/html ln -s /cluster/data/triCas2/html /gbdb/triCas2/html ln -s /cluster/data/triCas2/triCas2.2bit /gbdb/triCas2/triCas2.2bit # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 10/22/2005 Andy) ssh hgwdev hgsql -N hgcentraltest -e 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("triCas2", "Sept. 2005", "/gbdb/triCas2", "T. castaneum", \ "ChLG4:2220000-2245000", 1, 58, \ "T. castaneum", \ "Tribolium castaneum", "/gbdb/triCas2/html/description.html", \ 0, 0, "Baylor");' hgsql -N hgcentraltest -e 'delete from defaultDb where name="triCas1"' hgsql -N hgcentraltest -e 'insert into defaultDb (genome, name) values ("T. castaneum", "triCas2");' hgsql -N hgcentraltest -e 'insert into genomeClade (genome, clade, priority) values ("T. castaneum", "insect", "100");' cd ~/kent/src/hg/makeDb/trackDb cvs update # Edit trackDb/makefile to add triCas2 to the DBS variable. mkdir -p beetle/triCas2 touch beetle/triCas2/description.html cvs add beetle/triCas2 cvs add beetle/triCas2/description.html cvs ci -m "Added triCas2 (red flour beetle)." makefile cvs ci -m "Empty initial trackDb.ra and description.html for triCas2" beetle # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # FIX CHROMOSOME NAMES (DONE 10/22/2005 Andy) ssh hgwdev cd /cluster/data/triCas2 for file in \ beetle.masked.fa \ downloads/beetle.fa \ bed/rmsk/rmsk.out \ bed/simpleRepeat/trf.bed; do sed 's/ChLG1=X/ChLG1X/' $file > tmp; mv tmp $file done hgsql triCas2 -e 'drop table chromInfo' faToTwoBit beetle.masked.fa triCas2.2bit hgsql triCas2 < ~/kent/src/hg/lib/chromInfo.sql twoBitInfo triCas2.2bit /dev/stdout | awk '{printf("%s\t%s\t/gbdb/triCas2/triCas2.2bit\n", $1, $2)}' > chrom.sizes echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql triCas2 cd bed/rmsk/ hgsql triCas2 -e 'drop table rmsk' hgLoadOut triCas2 rmsk.out hgsql triCas2 -e 'rename table rmsk_rmsk to rmsk' hgsql triCas2 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' cd ../simpleRepeat/ hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \ triCas2 simpleRepeat trf.bed # MAKE GC5BASE WIGGLE TRACK (DONE 10/22/2005 Andy) ssh hgwdev mkdir /cluster/data/triCas2/bed/gc5Base cd /cluster/data/triCas2/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 triCas2 \ /cluster/data/triCas2 | wigEncode stdin gc5Base.wig gc5Base.wib mkdir /gbdb/triCas2/wib ln -s `pwd`/gc5Base.wib /gbdb/triCas2/wib hgLoadWiggle -pathPrefix=/gbdb/triCas2/wib triCas2 gc5Base gc5Base.wig ## MAKE DOWNLOADABLE FILES (DONE 10/22/2005 Andy) ssh hgwdev cd /cluster/data/triCas2 mkdir zips zip -j zips/allOut.zip rmsk/rmsk.out zip -j zips/allFa.zip beetle.masked.fa # zip -j zips/allTrf.zip bed/simpleRepeat/trfMask.bed zip -j zips/allAgp.zip downloads/beetle.agp ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/triCas2 cd /usr/local/apache/htdocs/goldenPath/triCas2 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips/ cp -p /cluster/data/triCas2/zips/*.zip . md5sum *.zip > md5sum.txt # check permissions chmod 664 * # MAKE 11.OOC FILE FOR BLAT (DONE 10/22/2005 Andy) ssh hgwdev cd /cluster/data/triCas2 blat triCas2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=300 # APIMEL2 BLASTZ (DONE 10/23/2005 Andy) ssh hgwdev cd /SAN mkdir blastz.apiMel2.2005-10-22 ln -s blastz.apiMel2.2005-10-22 blastz.apiMel2 cd blastz.apiMel2/ cp /cluster/data/triCas2/{triCas2.2bit,chrom.sizes} . cat << "_EOF_" > DEF # T. castaneum vs. A. mellifer export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h ome/angie/schwartzbin:/parasol/bin BASE=/san/sanVol1/scratch/triCas2/blastz.apiMel2.2005-10-22 BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/san/sanVol1/scratch/triCas2/blastz.apiMel2.2005-10-22/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - T. castaneum SEQ1_DIR=$BASE/triCas2.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=$BASE/chrom.sizes SEQ1_LIMIT=30 # QUERY - A. mellifer SEQ2_DIR=/scratch/hg/andy/apiMel2/nib SEQ2_CHUNK=2000000000 SEQ2_LAP=0 SEQ2_LEN=/scratch/hg/andy/apiMel2/chrom.sizes TMPDIR=/scratch/tmp _EOF_ doBlastzChainNet.sh -bigClusterHub pk -smallClusterHub pk DEF >& run.log & # monitor and continue where it breaks ./cleanUp.csh cd ../ cp -r blastz.apiMel2.2005-10-22/ /cluster/data/triCas2/bed/ # DM2 BLASTZ (DONE 10/23/2005 Andy) ssh hgwdev cd /SAN mkdir blastz.dm2.2005-10-23 cd blastz.dm2.2005-10-23/ cp /cluster/data/triCas2/{triCas2.2bit,chrom.sizes} . cp -r /cluster/data/dm2/nib/ dm2.nib mv chrom.sizes triCas2.sizes cat << "_EOF_" > DEF # T. castaneum vs. D. melanogaster export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h ome/angie/schwartzbin:/parasol/bin BASE=/san/sanVol1/scratch/triCas2/blastz.dm2.2005-10-23 BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=$BASE/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - T. castaneum SEQ1_DIR=$BASE/triCas2.2bit SEQ1_CHUNK=2000000 SEQ1_LAP=10000 SEQ1_LEN=$BASE/triCas2.sizes SEQ1_LIMIT=15 # QUERY - D. melanogaster SEQ2_DIR=$BASE/dm2/nib SEQ2_CHUNK=2000000000 SEQ2_LAP=0 SEQ2_LEN=$BASE/dm2.sizes TMPDIR=/scratch/tmp _EOF_ screen doBlastzChainNet.pl -bigClusterHub pk -smallClusterHub pk DEF >& run.log & screen -d ## PROBLEMS! ## During the chainRun step, 24/171 failed 4 times # ssh pk # cd /SAN/blastz.dm2.2005-10-23/axtChain/run # para check ##171 jobs in batch ##37257 jobs (including everybody's) in Parasol queue. ##Checking finished jobs ##crashed: 24 ##ranOk: 147 ##failed 4 times: 24 ##total jobs in batch: 171 ## OK... this happens because the input in pslParts/ for these 24 jobs is ## all comments. Double-check this: # for f in `cat problems`; do zcat ../../pslParts/$f.psl.gz | grep -v "^#"; done ## ... nothing. OK no big deal, just pretend the chainRun.csh script finished ## and continue with chainMerge. # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 12/12/2005 Andy) ssh hgwdev hgsql -N hgcentraltest -e 'insert into blatServers values("triCas2", "blat19", "17784", 1, 0); insert into blatServers values("triCas2", "blat19", "17784", 0, 1);' # FLY PROTEINS TBLASTN (DONE 12/14/2005 Andy) ssh hgwdev cd san/triCas2/ mkdir tblastn.dm2 cd tblastn.dm2/ # Download x64 NCBI Blast mkdir blast cd blast/ wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/blast-2.2.13-x64-linux.tar.gz tar xfz blast-2.2.13-x64-linux.tar.gz ln -s blast-2.2.13/{bin,data} cd ../ # Make the blast database (target) ssh kkstore01 cd /cluster/data/triCas2 mkdir splits faSplit -oneFile -lift=splits/splits.lft size beetle.masked.fa 10000000 splits/splits cd splits/ faSplit sequence splits.fa 20 beet_ rm splits.fa ssh kolossus cd san/triCas2/tblastn.dm2/splits/ for fa in beet*.fa; do split=${fa%.fa} ../blast/bin/formatdb -i $fa -pF -n $split done rm *.fa formatdb.log # Make queries ssh hgwdev cd san/triCas2/tblastn.dm2/ mkdir flyBaseExons cd flyBaseExons/ # I want about 500 pieces calc `cat /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | wc -l`/500 #18929/500 = 37.858000 splitFile /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl 38 fbExons_ for exonPsl in *; do pslxToFa $exonPsl $exonPsl.fa; rm $exonPsl; done # Set up run ssh pk cd san/triCas2/tblastn.dm2/ ls -1S flyBaseExons/*.fa > exons.lst ls -1S splits/beet*.nsq > targets.lst cp /cluster/data/dm2/bed/blat.dm2FB/protein.lft . cp /san/sanVol1/scratch/andy/tblastnExons . mv splits/splits.lft . mkdir blastOut/ cat << "EOF" > blastGsub #LOOP ./tblastnExons {check in exists $(path1)} {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl} splits.lft protein.lft ./blast/data ./blast/bin/blastall #ENDLOOP EOF chmod +x tblastnExons gensub2 targets.lst exons.lst blastGsub spec para create spec para try para push para time #Completed: 9980 of 9980 jobs #CPU time in finished jobs: 375481s 6258.02m 104.30h 4.35d 0.012 y #IO & Wait Time: 78520s 1308.66m 21.81h 0.91d 0.002 y #Average job time: 45s 0.76m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 478s 7.97m 0.13h 0.01d #Submission to last job: 1413s 23.55m 0.39h 0.02d # Chaining part ssh kkstore01 cd /cluster/data/triCas2/bed cp -r /san/sanVol1/scratch/triCas2/tblastn.dm2 . cd tblastn.dm2/blastOut/ for dir in *; do pushd $dir cat *.psl | simpleChain -prot -outPsl -maxGap=25000 /dev/stdin ../c.${dir}.psl popd done mkdir ../simpleChain mv *.psl ../simpleChain/ cd ../simpleChain/ for psl in *; do part=${psl#c.}; part=${part%.psl}; echo $part awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$part.psl > c60.$part.psl sort -rn c60.$part.psl | pslUniq stdin u.$part.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$part.psl > m60.$part.psl done sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > ../blastnDm2.psl # Load ssh hgwdev cd /cluster/data/triCas2/bed/tblastn.dm2 hgLoadPsl triCas2 blastDm2FB.psl # HUMAN PROTEINS TBLASTN (Done 12/14/2005 Andy) ssh hgwdev cd san/triCas2/ mkdir tblastn.hg17 cd tblastn.hg17/ ln -s ../../andy/triCas2.tblastn/splits.lft ln -s ../../andy/triCas2.tblastn/splits ln -s ../../andy/blast mkdir humanKgExons cd humanKgExons/ # I want about 1000 pieces calc `cat /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | wc -l`/ 1000 #18929/500 = 37.365 splitFile /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl 38 kgExons_ for exonPsl in *; do pslxToFa $exonPsl $exonPsl.fa; rm $exonPsl; done # Begin run ssh pk cd san/triCas2/tblastn.hg17/ ls -1S humanKgExons/*.fa > exons.lst ls -1S splits/beet*.nsq > targets.lst cp /cluster/data/hg17/bed/blat.hg17KG/protein.lft . cp /san/sanVol1/scratch/andy/tblastn{Exons,Gsub} . mkdir blastOut/ gensub2 targets.lst exons.lst tblastnGsub spec para create spec para try para push para time #Completed: 19680 of 19680 jobs #CPU time in finished jobs: 1258626s 20977.10m 349.62h 14.57d 0.040 y #IO & Wait Time: 95442s 1590.70m 26.51h 1.10d 0.003 y #Average job time: 69s 1.15m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 410s 6.83m 0.11h 0.00d #Submission to last job: 3891s 64.85m 1.08h 0.05d # Chaining ssh kkstore01 cd /cluster/data/triCas2/bed cp -rd /san/sanVol1/scratch/triCas2/tblastn.hg17 . cd tblastn.hg17/blastOut/ cp /san/sanVol1/scratch/andy/triCas2.tblastn/tblastnChain ../ ../tblastnChain blastHg17KG.psl # Loading ssh hgwdev cd /cluster/data/triCas2/bed/tblastn.hg17 hgLoadPsl triCas2 blastHg17KG.psl ########################################################################### # FIX UP AGP, RELOAD GOLD (DONE 10/26/06 angie) # The AGP contains a bunch of "ChLG1=X" lines but the sequence name # is "ChLG1X" -- fix it, reload, and leave the fixed .agp file in a # place where makeDownloads.pl will find it. ssh hgwdev cd /cluster/data/triCas2 sed -e 's/ChLG1=X/ChLG1X/' downloads/nocomments.agp > triCas2.agp hgGoldGapGl -noGl triCas2 triCas2.agp ########################################################################### # REPLACE trfMask.bed (DONE 10/26/06 angie) # I see trfMask.bed mentioned earlier in the file, but it is no longer # anywhere under /cluster/data/triCas2. Regenerating: cd /cluster/data/triCas2/bed/simpleRepeat awk '{if ($5 <= 12) print;}' trf.bed > trfMask.bed ########################################################################### # MAKE DOWNLOADABLE FILES (DONE 10/26/06 angie) ssh kkstore04 makeDownloads.pl triCas2 # Edit README.txt files as directed by script output (add link to # Baylor's project page and RepeatMasker version info).