# for emacs: -*- mode: sh; -*- # # RED FLOUR BEETLE!!! # Tribolium castaneum -- 1.0 assembly Jan 20, 2005 # # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/ # DOWNLOAD SEQUENCE (DONE 6/9/2005 Andy) ssh hgwdev df -h /cluster/store* #Filesystem Size Used Avail Use% Mounted on #kkstore01-10:/export/cluster/store1 # 1.1T 749G 265G 74% /cluster/store1 #kkstore01-10:/export/cluster/store10 # 1.1T 965G 49G 96% /cluster/store10 #kkstore-10:/export/cluster/store2 # 472G 431G 18G 97% /cluster/store2 #eieio-10:/export/cluster/store3 # 767G 678G 51G 94% /cluster/store3 #eieio-10:/export/cluster/store4 # 514G 460G 28G 95% /cluster/store4 #eieio-10:/export/cluster/store5 # 1.1T 947G 53G 95% /cluster/store5 #kkusr01-10:/export/cluster/store6 # 2.0T 1.6T 293G 85% /cluster/store6 #kkusr01-10:/export/cluster/store7 # 789G 734G 15G 99% /cluster/store7 #kkstore01-10:/export/cluster/store8 # 1.3T 1.2T 54G 96% /cluster/store8 #kkstore01-10:/export/cluster/store9 # 1.3T 1.1T 159G 88% /cluster/store9 # Ok, store9 looks the happiest. mkdir -p /cluster/store9/triCas1/downloads ln -s /cluster/store9/triCas1 /cluster/data/triCas1 cd /cluster/data/triCas1/downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/contigs/Tcas_1.0.agp wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Tcastaneum/contigs/Tcas_1.0.fa grep W Tcas_1.0.agp | cut -f6 | sort | uniq | wc -l # 11323 grep '^>' Tcas_1.0.fa | wc -l # 11323 # Wow, is this an AGP file I don't have to fix? grep W Tcas_1.0.agp | cut -f1 | sort | uniq | wc -l # 4365 # Well, that reduces the clutter by 60% or something. Not bad. agpAllToFaFile Tcas_1.0.agp Tcas_1.0.fa beetle.fa #Reading Tcas_1.0.agp #Start doesn't match previous end line 15269 of Tcas_1.0.agp # OK that's garbage. I don't really understand why this errored. # I deleted this line from Tcas_1.0.agp manually and rerunning agpAllToFaFile # worked: #Tcas_1.0.agp:15289:Contig7119 1 1820 1 W Contig7119 1 1820 + # Turns out thats because the .agp file also has this line: #Tcas_1.0.agp:13842:Contig7119 1 3018 1 W Reptig1071 1 3018 - # OK I changed line 13842 to: #Tcas_1.0.agp:13842:Reptig1071 1 3018 1 W Reptig1071 1 3018 + agpAllToFaFile Tcas_1.0.agp Tcas_1.0.fa beetle.fa #Reading Tcas_1.0.agp #Reading Tcas_1.0.fa #Writing beetle.fa # Maybe everything's cool now. I don't know. I'll continue when I hear back from Baylor. # BACK IN EFFECT (6/10/2005) mv Tcas_1.0.agp beetle.agp # CREATING DATABASE (DONE 7/6/2005 Andy) # Create the database. ssh hgwdev hgsql '' -e 'create database triCas1' # LOAD GAP & GOLD TABLES FROM AGP (DONE 7/6/2005 Andy) ssh hgwdev cd /cluster/data/triCas1/downloads hgGoldGapGl -noGl triCas1 beetle.agp # For some reason, the indices did not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". # *** Andy's note: the same thing happened in this assembly too. hgsql triCas1 -e 'analyze table gold; analyze table gap;' # REPEAT MASKER (DONE 7/6/2005 Andy) ssh kkstore01 cd /cluster/data/triCas1/downloads mkdir -p /panasas/store/triCas1/rmskSplits #faSplit about beetle.fa 500000 ../rmskSplits/beet_ faSplit -lift=/panasas/store/triCas1/rmskSplits/rmsk.lft -outDirDepth=1 gap \ beetle.fa 250000 /panasas/store/triCas1/rmskSplits/beet_ # Check the library for apis repeats. /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species tribolium -stat #Total Sequence Length = 0 bp /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species beetle -stat #Total Sequence Length = 0 bp /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species castaneum -stat #Total Sequence Length = 0 bp /cluster/bluearc/RepeatMasker/util/queryRepeatDatabase.pl -species Tenebrionoidea -stat #Total Sequence Length = 52981 bp ssh kk cd /cluster/data/triCas1 mkdir -p rmsk/run rmsk/out cd rmsk/run/ cat << "_EOF_" > rmsk.sh #!/bin/bash splitDir=${2#*RMOut/} splitDir=${splitDir%/*} file=`basename $1` chunk=${file%.fa} cd /cluster/data/triCas1/rmsk/out mkdir -p $splitDir mkdir -p /tmp/triCas1 pushd /tmp/triCas1 /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec Tenebrionoidea $1 popd cp $1.out ./$splitDir/$chunk.out rm -rf $1.* rmdir --ignore-fail-on-non-empty /tmp/triCas1 _EOF_ # <<; chmod +x rmsk.sh cat << "_EOF_" > gsub #LOOP ./rmsk.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDirs1=1)/$(root1).out} #ENDLOOP _EOF_ find /panasas/store/triCas1/rmskSplits -name '*.fa' > fastas.lst ~/kent/src/parasol/bin/gensub2 fastas.lst single gsub spec para create spec para try para push para time #Completed: 4626 of 4626 jobs #CPU time in finished jobs: 85555s 1425.92m 23.77h 0.99d 0.003 y #IO & Wait Time: 142854s 2380.90m 39.68h 1.65d 0.005 y #Average job time: 49s 0.82m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 315s 5.25m 0.09h 0.00d #Submission to last job: 1022s 17.03m 0.28h 0.01d ssh hgwdev cd /cluster/data/triCas1/rmsk find out -name "*.out" -exec tail +4 '{}' ';' > rmsk.out rm -rf out/ liftUp rmsk.lifted.out /panasas/store/triCas1/rmskSplits/rmsk.lft warn rmsk.out mv rmsk.lifted.out rmsk.out hgLoadOut triCas1 rmsk.out hgsql triCas1 -e 'rename table rmsk_rmsk to rmsk' hgsql triCas1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' # SIMPLE REPEATS (TRF) (DONE 7/6/2005 Andy) ssh kk cd /cluster/data/triCas1 mkdir -p bed/simpleRepeat/out bed/simpleRepeat/run cd bed/simpleRepeat/run/ mkdir /santest/scratch/triCas1/trf/out cat << "_EOF_" > gsub #LOOP ./trfBig.sh {check in line+ $(path1)} {check out line+ /santest/scratch/triCas1/trf/out/$(lastDirs1=1)/$(root1).bed} #ENDLOOP _EOF_ cat << "_EOF_" > trfBig.sh #!/bin/bash outDir=${2%/*} mkdir -p $outDir trfBig $1 $2 -bed -tempDir=/tmp > /dev/null _EOF_ chmod +x trfBig.sh find /panasas/store/triCas1/rmskSplits -name '*.fa' > fastas.lst ~/kent/src/parasol/bin/gensub2 fastas.lst single gsub spec para create spec para try para push para time #Completed: 4626 of 4626 jobs #CPU time in finished jobs: 4462s 74.36m 1.24h 0.05d 0.000 y #IO & Wait Time: 12492s 208.20m 3.47h 0.14d 0.000 y #Average job time: 4s 0.06m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 152s 2.53m 0.04h 0.00d #Submission to last job: 175s 2.92m 0.05h 0.00d ssh hgwdev cd /cluster/data/triCas1/bed/simpleRepeat cat urch*.bed > trf.bed rm urch*.be # Load this into the database as so ssh hgwdev cd /cluster/data/strPur1/bed/simpleRepeat cp -r /santest/scratch/triCas1/trf/out . rm -rf /santest/scratch/triCas1/trf find out -name "*.bed" -exec grep "^beet" '{}' ';' > trf.bed liftUp trf.lifted.bed /panasas/store/triCas1/rmskSplits/rmsk.lft warn trf.bed mv trf.lifted.bed trf.bed hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \ triCas1 simpleRepeat trf.bed rm -rf out/ # MAKE MASKED 2BIT (DONE 7/6/2005 Andy) # make a filtered version of the trf output keep trf's with period <= 12: ssh kkstore01 cd /cluster/data/triCas1/bed/simpleRepeat awk '{if ($5 <= 12) print;}' trf.bed > trfMask.bed cd ../../ mkdir maskedFa cd maskedFa/ maskOutFa -soft ../downloads/beetle.fa ../bed/simpleRepeat/trfMask.bed beetle.softMasked.fa maskOutFa -softAdd beetle.softMasked.fa ../rmsk/rmsk.out beetle.softMasked.fa maskOutFa beetle.softMasked.fa hard beetle.hardMasked.fa faToTwoBit beetle.softMasked.fa ../triCas1.2bit # Load into database ssh hgwdev cd /cluster/data/triCas1 hgsql triCas1 < ~/kent/src/hg/lib/chromInfo.sql twoBitInfo triCas1.2bit /dev/stdout | awk '{printf("%s\t%s\t/gbdb/triCas1/triCas1.2bit\n", $1, $2)}' > chrom.sizes echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql triCas1 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 7/6/2005 Andy) # Copy all the data from the table "grp" in an existing database to the new database ssh hgwdev hgsql triCas1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 7/6/2005 Andy) # Warning: genome and organism fields must correspond # with defaultDb values ssh hgwdev echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("triCas1", "Jan. 2005", "/gbdb/triCas1", "T. castaneum", \ "Contig605_Contig1746:20000-45000", 1, 58, \ "T. castaneum", \ "Tribolium castaneum", "/gbdb/triCas1/html/description.html", \ 0, 0, "Baylor");' \ | hgsql -N hgcentraltest echo 'insert into defaultDb (genome, name) values ("T. castaneum", "triCas1");' \ | hgsql -N hgcentraltest echo 'insert into genomeClade (genome, clade, priority) values ("T. castaneum", "insect", "100");' \ | hgsql -N hgcentraltest cd ~/kent/src/hg/makeDb/trackDb cvs update # Edit trackDb/makefile to add triCas1 to the DBS variable. mkdir -p beetle/triCas1 touch beetle/triCas1/description.html # Create a simple worm/triCas1/description.html file. In the initial case it's just empty. cvs add beetle cd beetle cvs add triCas1 cvs add triCas1/description.html #make update DBS=triCas1 ZOO_DBS= # go public on genome-test cvs ci -m "Added triCas1 (red flour beetle)." makefile cvs ci -m "Empty initial trackDb.ra and description.html for triCas1" beetle # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # SOME GBDB STUFF (DONE 7/6/2005 Andy) ssh hgwdev mkdir -p /gbdb/triCas1 mkdir /cluster/data/triCas1/html ln -s /cluster/data/triCas1/html /gbdb/triCas1/html ln -s /cluster/data/triCas1/triCas1.2bit /gbdb/triCas1/triCas1.2bit # MAKE GC5BASE WIGGLE TRACK (DONE 7/6/2005 Andy) ssh hgwdev mkdir /cluster/data/triCas1/bed/gc5Base cd /cluster/data/triCas1/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 triCas1 \ /cluster/data/triCas1 | wigEncode stdin gc5Base.wig gc5Base.wib mkdir /gbdb/triCas1/wib ln -s `pwd`/gc5Base.wib /gbdb/triCas1/wib hgLoadWiggle -pathPrefix=/gbdb/triCas1/wib triCas1 gc5Base gc5Base.wig ## MAKE DOWNLOADABLE FILES (DONE 7/6/2005 Andy) ssh kkstore01 cd /cluster/data/triCas1 mkdir zips zip -j zips/allOut.zip rmsk/rmsk.out zip -j zips/allFa.zip maskedFa/beetle.softMasked.fa zip -j zips/allTrf.zip bed/simpleRepeat/trfMask.bed zip -j zips/allAgp.zip downloads/beetle.agp zip -j zips/allFaMasked.zip maskedFa/beetle.hardMasked.fa ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/triCas1 cd /usr/local/apache/htdocs/goldenPath/triCas1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips/ cp -p /cluster/data/triCas1/zips/*.zip . md5sum *.zip > md5sum.txt # check permissions chmod 664 * # MAKE 11.OOC FILE FOR BLAT (DONE 7/6/2005 Andy) ssh hgwdev mkdir -p /panasas/store/triCas1 blat /cluster/data/triCas1/triCas1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/panasas/store/triCas1/11.ooc -repMatch=300 # PRODUCING GENSCAN PREDICTIONS (DONE 7/6/2005 Andy) ssh kkstore01 # Make hard-masked scaffolds and split up for processing: mkdir -p /panasas/store/triCas1/hardMaskedFaSplits cd /panasas/store/triCas1/hardMaskedFaSplits faSplit -outDirDepth=1 -lift=hardMasked.lft gap /cluster/data/triCas1/maskedFa/beetle.hardMasked.fa 400000 . # Check out hg3rdParty/genscanlinux to get latest genscan: ssh hgwdev mkdir -p /cluster/data/triCas1/bed/genscan cd /cluster/data/triCas1/bed/genscan cvs co hg3rdParty/genscanlinux mkdir run cd run/ find /panasas/store/triCas1/hardMaskedFaSplits -name '*.fa' > fastas.lst cat << "_EOF_" > genscan.sh #!/bin/bash basedir=${2%/gtf/*} splitdir=${2#*gtf/} splitdir=${splitdir%/*} mkdir -p $basedir/{pep,bed,gtf}/$splitdir gsBig $* _EOF_ chmod +x genscan.sh mkdir -p /cluster/bluearc/triCas1/genscan/{pep,bed,gtf} cat << "_EOF_" > gsub #LOOP ./genscan.sh {check in line+ $(path1)} {check out line /cluster/bluearc/triCas1/genscan/gtf/$(lastDirs1=1)/$(root1).gtf} -trans={check out line /cluster/bluearc/triCas1/genscan/pep/$(lastDirs1=1)/$(root1).pep} -subopt={check out line /cluster/bluearc/triCas1/genscan/bed/$(lastDirs1=1)/$(root1).bed} -exe=../hg3rdParty/genscanlinux/genscan -par=../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=500000 #ENDLOOP _EOF_ # << this line keeps emacs coloring happy ~/kent/src/parasol/bin/gensub2 fastas.lst single gsub spec ssh kk cd /cluster/data/triCas1/bed/genscan/run para create spec para try para check para push para time #Completed: 4472 of 4472 jobs #CPU time in finished jobs: 8119s 135.31m 2.26h 0.09d 0.000 y #IO & Wait Time: 12243s 204.05m 3.40h 0.14d 0.000 y #Average job time: 5s 0.08m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 40s 0.67m 0.01h 0.00d #Submission to last job: 494s 8.23m 0.14h 0.01d ssh kkstore01 cd /cluster/data/triCas1/bed/genscan find /cluster/bluearc/triCas1/genscan/ -name "*.gtf" -exec cat '{}' ';' > genscan.gtf find /cluster/bluearc/triCas1/genscan/ -name "*.pep" -exec cat '{}' ';' > genscan.pep find /cluster/bluearc/triCas1/genscan/ -name "*.bed" -exec cat '{}' ';' > genscan.bed liftUp genscan.lifted.gtf /panasas/store/triCas1/hardMaskedFaSplits/hardMasked.lft warn genscan.gtf liftUp genscanSubopt.lifted.bed /panasas/store/triCas1/hardMaskedFaSplits/hardMasked.lft warn genscan.bed # Clean up rm -rf /panasas/store/triCas1/hardMaskedFaSplits /cluster/bluearc/triCas1/genscan # Load into the database: cd /cluster/data/triCas1/bed/genscan ldHgGene -gtf triCas1 genscan genscan.lifted.gtf hgPepPred triCas1 generic genscanPep genscan.pep hgLoadBed triCas1 genscanSubopt genscanSubopt.lifted.bed # GENBANK mRNA AND EST COUNTS (DONE 7/9/2005 Andy) # Go to the latest GenBank full release dir and get an idea of how # many mRNAs and ESTs there are to align. ssh eieio cd /cluster/data/genbank/data/processed/genbank.148.0/full awk '$4 == "Tribolium" {print $4 " " $5;}' mrna.gbidx | sort | uniq -c # 102 Tribolium castaneum # 2 Tribolium confusum # 2 Tribolium freemani awk '$4 == "Tribolium" {print $4 " " $5;}' est*.gbidx | sort | uniq -c # 11187 Tribolium castaneum # 67 Tribolium confusum # AUTO UPDATE GENBANK MRNA RUN (DONE 4/23/2005 Andy) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvs update cd etc/ # Edit genbank.conf triCas1.genome = /cluster/bluearc/triCas1/triCas1.2bit triCas1.mondoTwoBitParts = 1000 triCas1.lift = no triCas1.refseq.mrna.native.load = yes triCas1.refseq.mrna.xeno.load = yes triCas1.refseq.mrna.xeno.pslReps = -minCover=0.25 -minAli=0.15 -minAli=0.60 -nearTop=0.005 triCas1.genbank.mrna.xeno.load = yes triCas1.genbank.est.native.load = no triCas1.genbank.est.xeno.load = no triCas1.downloadDir = triCas1 triCas1.perChromTables = no cvs commit -m 'triCas1 added to genbank update.' etc/genbank.conf # Edit src/align/gbBlat to add /panasas/store/triCas1/11.ooc cvs commit -m 'Sea urchin added' src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial triCas1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad triCas1 & featureBits triCas1 all_mrna #13122660 bases of 835421305 (1.571%) in intersection featureBits triCas1 xenoMrna # Clean up: rm -rf work/initial.triCas1 # BLASTZ/CHAIN/NET DM2 (DONE 7/14/05 Andy) ssh kkstore01 cd /cluster/data/triCas1/bed mkdir blastz.dm2.2005-07-14 ln -s blastz.dm2.2005-07-14 blastz.dm2 cd blastz.dm2/ cat << "_EOF_" > DEF # T. castaneum vs. D. melanogaster BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - T. castaneum SEQ1_DIR=/panasas/store/triCas1/triCas1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/panasas/store/triCas1/chrom.sizes # QUERY - D. melanogaster SEQ2_DIR=/iscratch/i/dm2/nib SEQ2_CHUNK=10000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/dm2/chrom.sizes BASE=/cluster/data/triCas1/bed/blastz.dm2 _EOF_ # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/triCas1Dm2 >& do.log & tail -f do.log # BLASTZ/CHAIN/NET APIMEL2 (DONE 7/14/05 Andy) ssh kkstore01 cd /cluster/data/triCas1/bed mkdir blastz.apiMel2.2005-07-14 ln -s blastz.apiMel2.2005-07-14 blastz.apiMel2 cd blastz.spiMel2/ cat << "_EOF_" > DEF # T. castaneum vs. D. melanogaster BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - T. castaneum SEQ1_DIR=/panasas/store/triCas1/triCas1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/panasas/store/triCas1/chrom.sizes # QUERY - A. mellifer SEQ2_DIR=/iscratch/i/apiMel2/nib SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes BASE=/cluster/data/triCas1/bed/blastz.apiMel2 _EOF_ # << this line keeps emacs coloring happy doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/triCas1ApiMel2 >& do.log & tail -f do.log # GOT PROBLEMS. Too many alignments. Netting fails (out of memory): #chainPreNet triCas1.apiMel2.all.chain.gz /panasas/store/triCas1/chrom.sizes /cluster/data/apiMel2/chrom.sizes stdout #Got 4366 chroms in /panasas/store/triCas1/chrom.sizes, 17 in /cluster/data/apiMel2/chrom.sizes #Out of memory needMem - request size 28 bytes # #memory usage 10211328, utime 0 s/100, stime 0 # #gzip: stdout: Broken pipe #Command failed: #ssh -x kolossus nice /cluster/data/triCas1/bed/blastz.apiMel2/axtChain/netChains.csh # The pslParts/ subdir has 4.4GB worth of data, which 50-100x more than I was expecting. # I guess I'll try changing the DEF file's SEQ2_LAP to 10000. ssh hgwdev cd /cluster/data/triCas1/bed mkdir blastz.apiMel2.2005-07-18 rm blastz.apiMel2 ln -s blastz.apiMel2.2005-07-18 blastz.apiMel2 cd blastz.apiMel2/ cat << "_EOF_" > DEF # T. castaneum vs. A. mellifer BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - T. castaneum SEQ1_DIR=/panasas/store/triCas1/triCas1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/panasas/store/triCas1/chrom.sizes # QUERY - A. mellifer SEQ2_DIR=/iscratch/i/apiMel2/nib SEQ2_CHUNK=1000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes BASE=/cluster/data/triCas1/bed/blastz.apiMel2 _EOF_ # << this line keeps emacs coloring happy screen doBlastzChainNet.pl DEF \ -blastzOutRoot /panasas/store/triCas1ApiMel2 >& do.log & screen -d # REPEAT REPEATMASKER (IN PROGROSS WAITING FOR CLUSTER 7/27/2005 Andy) # Problems like with sea urchin. Retry with a broader species library ssh hgwdev /cluster/bluearc/RepeatMasker/util/queryRepeatDatabase.pl -stat -species Endopterygota # Total Sequence Length = 1556344 bp cd /cluster/data/triCas1 mv rmsk rmsk.bad mkdir rmsk mkdir -p /santest/scratch/triCas1/rmsk/{out,run} ln -s /santest/scratch/triCas1/rmsk/out rmsk/out ln -s /santest/scratch/triCas1/rmsk/run rmsk/run cd rmsk/run/ cat << "_EOF_" > rmsk.sh #!/bin/bash splitDir=${2#../out/} splitDir=${splitDir%/*} file=`basename $1` chunk=${file%.fa} cd /cluster/data/triCas1/rmsk/out mkdir -p $splitDir /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec Endopterygota $1 cp $1.out /cluster/data/triCas1/rmsk/out/$splitDir/$chunk.out rm -rf $1.* _EOF_ # <<; chmod +x rmsk.sh cat << "_EOF_" > gsub #LOOP ./rmsk.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDirs1=1)/$(root1).out} #ENDLOOP _EOF_ find /panasas/store/triCas1/rmskSplits -name '*.fa' > fastas.lst ~/kent/src/parasol/bin/gensub2 fastas.lst single gsub spec # **** LEFT HERE ******** # Tested out OK, run this when cluster is back online. ssh pk cd /cluster/data/triCas1/rmsk/run para create spec para try para push para time #Completed: 4626 of 4626 jobs #CPU time in finished jobs: 2744778s 45746.31m 762.44h 31.77d 0.087 y #IO & Wait Time: 20146s 335.76m 5.60h 0.23d 0.001 y #Average job time: 598s 9.96m 0.17h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 4904s 81.73m 1.36h 0.06d #Submission to last job: 19309s 321.82m 5.36h 0.22d # Changed some stuff around for pk cluster run. # Cat and lift. ssh hgwdev cd /cluster/data/strPur1/rmsk/out # Find one with a header. ls -l 0 #head -n 3 0/beet.fa.out > header find . -name "*.fa.out" -exec tail +4 '{}' ';' > rest cat header rest > all.out rm header rest liftUp rmsk.lifted.out ../../rmskSplits/rmsk.lft warn all.out rm all.out featureBits -or triCas1 rmsk simpleRepeat #10028249 bases of 150025805 (6.684%) in intersection # Load results hgLoadOut triCas1 rmsk.lifted.out hgsql triCas1 -e 'rename table rmsk_rmsk to rmsk' hgsql triCas1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' # So how did we do? featureBits triCas1 -or rmsk simpleRepeat #10456754 bases of 150025805 (6.970%) in intersection # Move stuff off santest. ssh hgwdev cd /cluster/data/triCas1/rmsk rm out run cp -r /santest/scratch/triCas1/rmsk/* . rm -rf /santest/scratch/triCas1/rmsk