# for emacs: -*- mode: sh; -*- # Strongylocentrotus purpuratus -- 0.5 assembly April 15, 2005 # # ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/ # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-------------+---------------------------------+ # | tableName | type | # +-------------+---------------------------------+ # | refGene | genePred refPep refMrna | # | xenoRefGene | genePred xenoRefPep xenoRefMrna | # | genscan | genePred genscanPep | # +-------------+---------------------------------+ # DOWNLOAD SEQUENCE (DONE 4/4/05 Andy) -- REDONE 4/17/2005, see below mkdir -p /cluster/store5/strPur1/downloads ln -s /cluster/store5/strPur1 /cluster/data/strPur1 cd /cluster/data/strPur1/downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-contigs.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-reptigs.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20050323.agp wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/accession.contig.txt gunzip *.gz sed 's/^\(\w\+\)\b\W\+Spur_1.1_\(\w\+\).*/\1\t\1|\2/' accession.contig.txt > fromAccs.map sed 's/^\(\w\+\)\b\W\+Spur_1.1_\(\w\+\).*/\2\t\1|\2/' accession.contig.txt > fromContigs.map # agpChangeNames and faChangeNames are uncommitted utilities at this point. They can maybe go in oneShot # subs was too slow. sed 's/\.1//' Spur20050323.agp | \ agpChangeNames /dev/stdin fromAccs.map /dev/stdout | sort -k1,1 -k2,2n > urchin.agp rm Spur20050323.agp faChangeNames Spur20041123-contigs.fa fromContigs.map urchinContigs.fa faChangeNames Spur20041123-reptigs.fa fromContigs.map reptigs.fa cat urchinReptigs.fa >> urchinContigs.fa rm *.map Spur20041123-*.fa agpAllToFaFile urchin.agp urchinContigs.fa urchinSupertigs.fa rm urchinContigs.fa urchinReptigs.fa # UPDATED 4/17/2005 # OK I've notived that there's been an update. I'm not making the fancier named cd /cluster/data/strPur1/downloads wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-contigs.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050323/contigs/Spur20041123-reptigs.fa.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20050413.agp wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/accession.contig.txt sed 's/Spur_1.1_//' accession.contig.txt > tmp.map fixCr tmp.map awk '{printf("%s\t%s|%s\n", $1, $1, $2)}' tmp.map > fromAccs.map awk '{printf("%s\t%s|%s\n", $2, $1, $2)}' tmp.map > fromCont.map rm tmp.map sed 's/\.1//' Spur20050413.agp | \ agpChangeNames /dev/stdin fromAccs.map /dev/stdout | sort -k1,1 -k2,2n > urchin.agp faChangeNames Spur20041123-contigs.fa fromCont.map urchinContigs.fa faChangeNames Spur20041123-reptigs.fa fromCont.map reptigs.fa cat reptigs.fa >> urchinContigs.fa rm *.map Spur20041123-*.fa agpAllToFaFile urchin.agp urchinContigs.fa urchinScaffolds.fa rm Spur* urchinContigs.fa reptigs.fa *.map accession.contig.txt # CREATING DATABASE (DONE 4/10/05 Andy) # Create the database. ssh hgwdev hgsql '' -e 'create database strPur1' # LOAD GAP & GOLD TABLES FROM AGP (DONE 4/10/2005 Andy) (REDONE 4/17/2005 Andy) ssh hgwdev cd /cluster/data/strPur1/downloads hgGoldGapGl -noGl strPur1 urchin.agp # For some reason, the indices did not get built correctly -- # "show index from gap/gold" shows NULL cardinalities for chrom. # Rebuild indices with "analyze table". # *** Andy's note: the same thing happened in this assembly too. hgsql strPur1 -e 'analyze table gold; analyze table gap;' # REPEATMASKER (DONE 4/14/05 Andy) (REDONE 4/17/2005 Andy) # Check the library for urchin repeats. Nothing with "urchin", or "purpuratus". /cluster/bluearc/RepeatMasker050112/util/queryRepeatDatabase.pl -species Strongylocentrotus -stat #>IS1#ARTEFACT Length = 768 bp #>IS2#ARTEFACT Length = 1331 bp #>IS3#ARTEFACT Length = 1258 bp #>IS5#ARTEFACT Length = 1195 bp #>IS10#ARTEFACT Length = 1329 bp #>IS30#ARTEFACT Length = 1221 bp #>IS150#ARTEFACT Length = 1443 bp #>IS186#ARTEFACT Length = 1338 bp #>Tn1000#ARTEFACT Length = 5981 bp #>(CA)n#Simple_repeat Length = 180 bp #... #>SPRP2#Unknown Length = 247 bp #>SU7N33#Unknown Length = 195 bp # #Total Sequence Length = 50249 bp # Split files for the run: ssh eieio cd /cluster/data/strPur1 mkdir unmaskedSplits cd unmaskedSplits/ faSplit -outDirDepth=3 -lift=splits.lft gap ../downloads/urchinScaffolds.fa 500000 . # copy to panasas mkdir -p /panasas/store/strPur1 cd /panasas/store/strPur1 cp -r /cluster/data/strPur1/unmaskedSplits . ssh kk cd /cluster/data/strPur1 mkdir RMRun RMOut jkStuff cat << "_EOF_" > jkStuff/RMApis #!/bin/bash animal=Strongylocentrotus genome=strPur1 inloc=/panasas/store/strPur1/unmaskedSplits outloc=/cluster/data/strPur1/RMOut file=`basename $1` dir=`dirname $1` cd $outloc mkdir -p /tmp/$genome/$dir cp $inloc/$1 /tmp/$genome/$dir pushd /tmp/$genome/$dir /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec $animal $file popd mkdir -p $dir cp /tmp/$genome/$dir/$file.out ./$dir/$file.out rm -fr /tmp/$genome/$dir/$file.* rmdir --ignore-fail-on-non-empty /tmp/$genome/$dir rmdir --ignore-fail-on-non-empty /tmp/$genome _EOF_ # << this line makes emacs coloring happy chmod +x jkStuff/RMApis cd unmaskedSplits/ find . -name '*.fa' | sed 's/^\.\///' > ../RMRun/files.lst cd ../RMRun/ cat << "_EOF_" > gsub #LOOP ../jkStuff/RMApis $(path1) {check in line+ /panasas/store/strPur1/unmaskedSplits/$(path1)} {check out line+ ../RMOut/$(dir1)/$(root1).fa.out} #ENDLOOP _EOF_ gensub2 files.lst single gsub RMJobs para create RMJobs para try para push para time #Completed: 187983 of 187983 jobs #CPU time in finished jobs: 1025942s 17099.04m 284.98h 11.87d 0.033 y #IO & Wait Time: 513501s 8558.35m 142.64h 5.94d 0.016 y #Average job time: 8s 0.14m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 159s 2.65m 0.04h 0.00d #Submission to last job: 3927s 65.45m 1.09h 0.05d # Lift results. ssh eieio cd /cluster/data/strPur1/RMOut head -n 3 0/0/0/97000.fa.out > header find . -name "*.fa.out" -exec tail +4 '{}' ';' > rmsk.out cat header rmsk.out > all.out rm rmsk.out header liftUp rmsk.lifted.out ../unmaskedSplits/splits.lft warn all.out rm all.out # Load results ssh hgwdev cd /cluster/data/strPur1/RMOut hgLoadOut strPur1 rmsk.lifted.out hgsql strPur1 -e 'rename table rmsk_rmsk to rmsk' hgsql strPur1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' # SIMPLE REPEATS (TRF) (DONE 4/15/2005 Andy) (REDONE 4/19/2005 Andy) ssh eieio mkdir -p /cluster/data/strPur1/bed/simpleRepeat cd /panasas/store/strPur1 # Split up stuff mkdir kk9Splits cd kk9Splits/ faSplit sequence /cluster/data/strPur1/downloads/urchinScaffolds.fa 100 urchin_ ssh kk9 cd /cluster/data/strPur1/bed/simpleRepeat cat << "_EOF_" > gsub #LOOP ./trfBig.sh {check in line+ $(path1)} {check out line+ $(root1).bed} -bed -tempDir=/tmp #ENDLOOP _EOF_ cat << "_EOF_" > trfBig.sh #!/bin/bash trfBig $* > /dev/null _EOF_ chmod +x trfBig.sh ls -1 /panasas/store/strPur1/kk9Splits/* > files.lst gensub2 files.lst single gsub spec para create spec para try para push para time #Completed: 99 of 99 jobs #CPU time in finished jobs: 24356s 405.93m 6.77h 0.28d 0.001 y #IO & Wait Time: 1121s 18.69m 0.31h 0.01d 0.000 y #Average job time: 257s 4.29m 0.07h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 1044s 17.40m 0.29h 0.01d #Submission to last job: 1044s 17.40m 0.29h 0.01d ssh eieio cd /cluster/data/strPur1/bed/simpleRepeat cat urch*.bed > trf.bed rm urch*.be # Load this into the database as so ssh hgwdev cd /cluster/data/strPur1/bed/simpleRepeat cat urch*.bed | grep trf > trf.bed rm urch*.bed hgLoadBed -sqlTable=/cluster/home/aamp/kent/src/hg/lib/simpleRepeat.sql \ strPur1 simpleRepeat trf.bed rm -rf /panasas/store/strPur1/kk9Splits # MAKE MASKED 2BIT (DONE 4/15/2005 Andy) # make a filtered version of the trf output: # keep trf's with period <= 12: ssh eieio cd /cluster/data/strPur1/bed/simpleRepeat awk '{if ($5 <= 12) print;}' trf.bed > trfMask.bed cd ../../ mkdir maskedFa cd maskedFa/ maskOutFa -soft ../downloads/urchinScaffolds.fa ../bed/simpleRepeat/trfMask.bed urchin.softMasked.fa maskOutFa -softAdd urchin.softMasked.fa ../RMOut/rmsk.lifted.out urchin.softMasked.fa maskOutFa urchin.softMasked.fa hard urchin.hardMasked.fa faToTwoBit urchin.softMasked.fa strPur1.2bit # faToTwoBit urchin.hardMasked.fa strPur1.hardMasked.2bit mv *.2bit ../ cd ../ # Load into database ssh hgwdev hgsql strPur1 < ~/kent/src/hg/lib/chromInfo.sql twoBitInfo strPur1.2bit /dev/stdout | awk '{printf("%s\t%s\t/gbdb/strPur1/strPur1.2bit\n", $1, $2)}' > chrom.sizes echo "load data local infile 'chrom.sizes' into table chromInfo;" | hgsql strPur1 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 4/15/2005 Andy) # Copy all the data from the table "grp" # in an existing database to the new database ssh hgwdev hgsql strPur1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp' # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 4/15/2005 Andy) # Warning: genome and organism fields must correspond # with defaultDb values echo 'INSERT INTO dbDb \ (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) values \ ("strPur1", "April 2005", "/gbdb/strPur1", "S. purpuratus", \ "Scaffold3158:1-12000", 1, 80, \ "S. purpuratus", \ "Strongylocentrotus purpuratus", "/gbdb/strPur1/html/description.html", \ 0, 0, "Baylor");' \ | hgsql -N hgcentraltest echo 'insert into defaultDb (genome, name) values ("S. purpuratus", "strPur1");' \ | hgsql -N hgcentraltest echo 'insert into genomeClade (genome, clade, priority) values ("S. purpuratus", "deuterostome", "30");' \ | hgsql -N hgcentraltest ssh hgwdev cd ~/kent/src/hg/makeDb/trackDb cvs update # Edit trackDb/makefile to add strPur1 to the DBS variable. mkdir -p urchin/strPur1 touch urchin/strPur1/description.html # Create a simple worm/strPur1/description.html file. In the initial case it's just empty. cvs add urchin cd urchin cvs add strPur1 cvs add strPur1/description.html #make update DBS=strPur1 ZOO_DBS= # go public on genome-test cvs ci -m "Added strPur1 (sea urchin)." makefile cvs ci -m "Empty initial trackDb.ra and description.html for strPur1" urchin # in a clean, updated tree's kent/src/hg/makeDb/trackDb: make alpha # SOME GBDB STUFF (DONE 4/15/2005 Andy) ssh hgwdev mkdir -p /gbdb/strPur1 mkdir /cluster/data/strPur1/html ln -s /cluster/data/strPur1/html /gbdb/strPur1/html ln -s /cluster/data/strPur1/strPur1.2bit /gbdb/strPur1/strPur1.2bit # MAKE GC5BASE WIGGLE TRACK (DONE 4/19/2005 Andy) ssh hgwdev mkdir /cluster/data/strPur1/bed/gc5Base cd /cluster/data/strPur1/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 strPur1 \ /cluster/data/strPur1 | wigEncode stdin gc5Base.wig gc5Base.wib mkdir /gbdb/strPur1/wib ln -s `pwd`/gc5Base.wib /gbdb/strPur1/wib hgLoadWiggle -pathPrefix=/gbdb/strPur1/wib strPur1 gc5Base gc5Base.wig ## MAKE DOWNLOADABLE FILES (DONE 4/19/2005 Andy) ssh eieio cd /cluster/data/strPur1 mkdir zips zip -j zips/allOut.zip RMOut/rmsk.lifted.out zip -j zips/allFa.zip maskedFa/*.fa zip -j zips/allTrf.zip bed/simpleRepeat/trfMask.bed zip -j zips/allAgp.zip downloads/urchin.agp cd maskedFa/ maskOutFa urchin.softMasked.fa hard urchin.hardMasked.fa cd ../ zip -j zips/allFaMasked.zip maskedFa/urchin.hardMasked.fa ssh hgwdev mkdir -p /usr/local/apache/htdocs/goldenPath/strPur1 cd /usr/local/apache/htdocs/goldenPath/strPur1 mkdir bigZips database # Create README.txt files in bigZips/ and database/ to explain the files. cd bigZips/ cp -p /cluster/data/strPur1/zips/*.zip . md5sum *.zip > md5sum.txt # MAKE 11.OOC FILE FOR BLAT (DONE 4/19/2005 Andy) ssh hgwdev mkdir -p /panasas/store/strPur1 blat /cluster/data/strPur1/strPur1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=/panasas/store/strPur1/11.ooc -repMatch=300 # PRODUCING GENSCAN PREDICTIONS (DONE 4/19/2005 Andy) ssh eieio # Make hard-masked scaffolds and split up for processing: cd /panasas/store/strPur1 mkdir hardMaskedFaSplits cd hardMaskedFaSplits/ faSplit -outDirDepth=3 -lift=hardMasked.lft gap /cluster/data/strPur1/maskedFa/urchin.hardMasked.fa 1000000 . mkdir /cluster/data/strPur1/bed/genscan # Check out hg3rdParty/genscanlinux to get latest genscan: ssh hgwdev mkdir -p /cluster/data/strPur1/bed/genscan cd /cluster/data/strPur1/bed/genscan cvs co hg3rdParty/genscanlinux # Make 3 subdirectories for genscan to put their output files in mkdir -p /panasas/store/strPur1/genscan find /panasas/store/strPur1/hardMaskedFaSplits -name '*.fa' | \ sed 's/\/panasas\/store\/strPur1\/hardMaskedFaSplits\///'> chunks.lst cat << _EOF_ > genscan.sh #!/bin/bash outdir=\`dirname \$2\` mkdir -p \$outdir gsBig \$* _EOF_ chmod +x genscan.sh cat << _EOF_ > gsub #LOOP ./genscan.sh {check in line+ /panasas/store/strPur1/hardMaskedFaSplits/\$(path1)} {check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).gtf} -trans={check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).pep} -subopt={check out line /panasas/store/strPur1/genscan/\$(dir1)/\$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP _EOF_ # << this line keeps emacs coloring happy gensub2 chunks.lst single gsub jobList ssh kk cd /cluster/data/strPur1/bed/genscan para create jobList para try para check para push para time #Completed: 187943 of 187943 jobs #CPU time in finished jobs: 154777s 2579.62m 42.99h 1.79d 0.005 y #IO & Wait Time: 2009533s 33492.21m 558.20h 23.26d 0.064 y #Average job time: 12s 0.19m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 992s 16.53m 0.28h 0.01d #Submission to last job: 5577s 92.95m 1.55h 0.06d # Lift and cat things ssh eieio cd /cluster/data/strPur1/bed/genscan find /panasas/store/strPur1/genscan/ -name "*.gtf" -exec cat '{}' ';' > genscan.gtf find /panasas/store/strPur1/genscan/ -name "*.pep" -exec cat '{}' ';' > genscan.pep find /panasas/store/strPur1/genscan/ -name "*.bed" -exec cat '{}' ';' > genscan.bed liftUp genscan.lifted.gtf /panasas/store/strPur1/hardMaskedFaSplits/hardMasked.lft warn genscan.gtf liftUp genscanSubopt.lifted.bed /panasas/store/strPur1/hardMaskedFaSplits/hardMasked.lft warn genscan.bed # Clean up rm -rf /panasas/store/strPur1/hardMaskedFaSplits # Load into the database: cd /cluster/data/strPur1/bed/genscan ldHgGene -gtf strPur1 genscan genscan.lifted.gtf hgPepPred strPur1 generic genscanPep genscan.pep hgLoadBed strPur1 genscanSubopt genscanSubopt.lifted.bed # GENBANK mRNA AND EST COUNTS (DONE 4/20/2005 Andy) # Go to the latest GenBank full release dir and get an idea of how # many mRNAs and ESTs there are to align. ssh eieio cd /cluster/data/genbank/data/processed/genbank.146.0/full grep purpuratus mrna.gbidx | wc -l #302 grep purpuratus est.*gbidx | wc -l #89289 cd /cluster/data/genbank/data/processed/refseq.10/full grep purpuratus mrna.gbidx | wc -l #219 # AUTO UPDATE GENBANK MRNA RUN (DONE 4/23/2005 Andy) ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvs update cd etc/ # Edit genbank.conf strPur1.genome = /iscratch/i/strPur1/strPur1.2bit strPur1.mondoTwoBitParts = 5000 strPur1.lift = no strPur1.refseq.mrna.native.load = yes strPur1.refseq.mrna.xeno.load = yes strPur1.refseq.mrna.xeno.pslReps = -minCover=0.25 -minAli=0.15 -nearTop=0.005 strPur1.genbank.mrna.xeno.load = yes strPur1.genbank.est.native.load = no strPur1.genbank.est.xeno.load = no strPur1.downloadDir = strPur1 strPur1.perChromTables = no cvs commit -m 'strPur1 added to genbank update.' etc/genbank.conf # Edit src/align/gbBlat to add /panasas/store/strPur1/11.ooc cvs commit -m 'Sea urchin added' src/align/gbBlat # Install to /cluster/data/genbank: make install-server ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank # This is an -initial run, mRNA only: nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial strPur1 & tail -f [its logfile] # Load results: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad strPur1 & featureBits strPur1 all_mrna #13122660 bases of 835421305 (1.571%) in intersection featureBits strPur1 xenoMrna # Clean up: rm -rf work/initial.strPur1 # HGGATEWAY IMAGE (picture used with permission) (DONE 5/7/2005 Andy) ssh hgwdev export CVSROOT=/projects/hg/cvsroot cvs co browser/images cd browser/images wget -O S_purpuratus.jpg http://biology.fullerton.edu/courses/biol_317/Web/murray/Fall01/Stronglyocentrotus%20purpuratus.JPG mogrify -geometry 200x200 S_purpuratus.jpg cvs add S_purpuratus.jpg cvs commit S_purpuratus.jpg # BLASTZ CIOSAV1 (DONE 5/23/2005 Andy) ssh eieio cd /cluster/data/strPur1/bed mkdir blastz.cioSav1.2005-05-20 cd blastz.cioSav1.2005-05-20/ cat << _EOF_ > DEF export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_L=6000 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - Sea urchin SEQ1_DIR=/iscratch/i/strPur1/strPur1.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 # QUERY - Sea squirt SEQ2_DIR=/iscratch/i/cioSav1/nib/cioSav1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 BASE=/cluster/data/strPur1/bed/blastz.cioSav1.2005-05-20 DEF=\$BASE/DEF RAW=\$BASE/raw CDBDIR=\$BASE SEQ1_LEN=/panasas/store/strPur1/S1.len SEQ2_LEN=\$BASE/S2.len _EOF_ # << this line keeps emacs coloring happy sort -rn +1 /cluster/data/strPur1/chrom.sizes | cut -f1-2 > /panasas/store/strPur1/S1.len cp /panasas/store/strPur1/S1.len . twoBitInfo /cluster/data/cioSav1/cioSav1.2bit /dev/stdout | sort -rn +1 > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: # BLASTZ CI1 (DONE 5/23/2005 Andy) ssh eieio cd /cluster/data/strPur1/bed mkdir blastz.ci1.2005-07-14 ln -s blastz.ci1.2005-07-14 blastz.ci1 cd blastz.ci1/ twoBitInfo /cluster/data/ci1/ci1.2bit /dev/stdout | sort -rn +1 > S2.len cat << "_EOF_" > DEF # D. melanogaster vs. T. castaneum BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 BASE=/cluster/data/strPur1/bed/blastz.ci1 # QUERY - Sea urchin SEQ2_DIR=/santest/scratch/strPur1/strPur1.2bit SEQ2_CHUNK=20000000 SEQ2_LAP=10000 SEQ2_LEN=/cluster/data/strPur1/chrom.sizes # TARGET - Sea squirt SEQ1_DIR=/santest/scratch/ci1/nib SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=$BASE/S2.len _EOF_ #<< Emacs screen doBlastzChainNet.pl DEF -blastzOutRoot \ /santest/scratch/bz.strPur1.ci1 >& do.log & screen -d # REDO REPEATMASKING (DONE 8/3/2005 Andy) # Currently the repeat-masking is too low-coverage. ssh hgwdev /cluster/bluearc/RepeatMasker/util/queryRepeatDatabase.pl -species Deuterostomia -stat #Total Sequence Length = 3617796 bp mv RMOut RMOut.bad mv RMRun RMRun.bad mkdir rmsk cd rmsk/ mkdir -p /santest/scratch/rmsk/run /santest/scratch/rmsk/out ln -s /santest/scratch/rmsk/run run ln -s /santest/scratch/rmsk/out out cd run/ cat << "_EOF_" > rmsk.sh #!/bin/bash animal=Deuterostomia genome=strPur1 outloc=`dirname $2` input=`basename $1` tmp=/scratch/${genome}.$$ mkdir $tmp mkdir -p $outloc cd $outloc pushd $tmp cp $1 . /cluster/bluearc/RepeatMasker/RepeatMasker -s -spec $animal $input popd cp $tmp/${input}.out . rm -fr $tmp _EOF_ # << this line makes emacs coloring happy chmod +x rmsk.sh find /panasas/store/strPur1/unmaskedSplits -name "*.fa" > splits.lst cat << "_EOF_" > gsub #LOOP ./rmsk.sh {check in line+ $(path1)} {check out line+ ../out/$(lastDirs1=3)/$(root1).fa.out} #ENDLOOP _EOF_ # Tested out OK, run this when cluster is back online. ~/kent/src/parasol/bin/gensub2 splits.lst single gsub spec ssh kk cd /cluster/data/strPur1/rmsk/run para create spec para push para time # Cat and lift. ssh hgwdev cd /cluster/data/strPur1/rmsk/out # Find one with a header. ls -l 0/0/0 head -n 3 0/0/0/97000.fa.out > header find . -name "*.fa.out" -exec tail +4 '{}' ';' > rest cat header rest > all.out rm header rest liftUp rmsk.lifted.out ../unmaskedSplits/splits.lft warn all.out rm all.out # Load results... before shot: featureBits -or strPur1 rmsk simpleRepeat #90840012 bases of 835421305 (10.874%) in intersection hgLoadOut strPur1 rmsk.lifted.out hgsql strPur1 -e 'rename table rmsk_rmsk to rmsk' hgsql strPur1 -e 'drop index bin on rmsk; \ drop index genoStart on rmsk; \ drop index genoEnd on rmsk; \ create index bin on rmsk (genoName(11), bin); \ create index genoStart on rmsk (genoName(11), genoStart); \ create index genoEnd on rmsk (genoName(11), genoEnd);' # So how did we do? featureBits -or strPur1 rmsk simpleRepeat #94138492 bases of 835421305 (11.268%) in intersection # Well... that sucks too. # Move stuff off santest. ssh hgwdev cd /cluster/data/strPur1/rmsk rm out run cp -r /santest/scratch/strPur1/rmsk/* . rm -rf /santest/scratch/strPur1/rmsk # REPEAT SCOUT DENOVO REPEAT FINDER (DONE 9/9/2005 Andy) ssh kolossus cd /scratch/andy cp /cluster/data/strPur1/downloads/urchinScaffolds.fa . wget http://repeatscout.bioprojects.org/RepeatScout-1.0.0.tar.gz tar xfz RepeatScout-1.0.0.tar.gz cd RepeatScout-1.0.0 make cd ../ screen ./RepeatScout-1.0.0/build_lmer_table -l 15 -sequence urchinScaffolds.fa -freq lmers.txt -v >& rscout.1.log # Control+a+d # Also re-niced this process: nice +10 # Started about 2:10pm # Finished around 3:15pm. cat rscout.1.log # Done allocating headptr # Done building headptr # There are 66717762 l-mers # Done sorting headptr ./RepeatScout-1.0.0/RepeatScout-v1 -seq urchinScaffolds.fa -output rscout.out -freqlmers.txt -l 15 # THIS DOESN'T RUN # Hahaha. Dumb cause of problem: -seq should be -sequence. However, after # exchanging a ton of e-mail with Neil Jones at UCSD, it's become apparent # that -l shouldn't be specified normally. 1.0.0 needed -l, so Neil # put 1.0.1 online. In addition, it's clear that the RepeatScout-v1 program # takes much longer to run than build_lmer_table, so that's becoming a cluster # run now. # # OK I'm basically starting over: ssh hgwdev cd /santest/scratch/strPur1 mkdir -p repeatScout/splits cd repeatScout/splits/ faSplit about /cluster/data/strPur1/downloads/urchinScaffolds.fa 2000000 rs_urch_ cd ../ wget -r ftp://ftp.ncbi.nih.gov/pub/seg/nseg/ cd ftp.ncbi.nih.gov/pub/seg/nseg/ make cp nseg nmerge /cluster/bin/i386 ssh kolossus cd /scratch/andy wget http://repeatscout.bioprojects.org/RepeatScout-1.0.1.tar.gz tar xfz RepeatScout-1.0.1.tar.gz cd RepeatScout-1.0.1/ make cp RepeatScout-v1 build_lmer_table /cluster/bin/x86_64 cp compare-out-to-gff.prl filter-stage-{1,2}.prl /cluster/bin/scripts ssh pk cd /san/sanVol1/scratch/strPur1/repeatScout mkdir run cd run/ mkdir ../outputs/ ../logs ../freqs/ ls -1 ../splits/ > fa.lst cat > rscout.sh << "_EOF_" #!/bin/bash tmp=/scratch/`basename $1 .fa` tmpFreq=$tmp.lmers.txt tmpOut=$tmp.out tmpLog=$tmp.log /cluster/bin/x86_64/build_lmer_table -sequence $1 -freq $tmpFreq -v >& $tmpLog /cluster/bin/x86_64/RepeatScout-v1 -sequence $1 -freq $tmpFreq -output $tmpOut >& $tmpLog.2 cp $tmpFreq $2 cp $tmpOut $3 cp $tmpLog $4 cp $tmpLog.2 $4.2 rm ${tmp}* _EOF_ cat > gsub << "EOF" #LOOP ./rscout.sh ../splits/$(path1) ../freqs/ ../outputs/ ../logs/ #ENDLOOP EOF chmod +x rscout.sh gensub2 single fa.lst gsub spec para create spec para try para push para time #533 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 533 of 533 jobs #CPU time in finished jobs: 29317s 488.61m 8.14h 0.34d 0.001 y #IO & Wait Time: 2552s 42.54m 0.71h 0.03d 0.000 y #Average job time: 60s 1.00m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 88s 1.47m 0.02h 0.00d #Submission to last job: 381s 6.35m 0.11h 0.00d cd ../ R=0 for out in outputs/*; do numR=`grep "R=" $out | wc -l` awk 'BEGIN{FS = "="};{if (/=/) {printf(">R=%d\n", $2 + "'"$R"'")} else print $1}' $out >> all.out R=$(($R + $numR)) done mkdir libSplits faSplit 400 all.out libSplits/all_ mkdir run2 filtered cd run2/ cat > filter1.sh << "EOF" #!/bin/bash input=`basename $1` output=`basename $2` cp $1 /scratch pushd /scratch /cluster/bin/scripts/filter-stage-1.prl < $input > $output popd cp /scratch/$output $2 rm /scratch/$input rm /scratch/$output EOF chmod +x filter1.sh cat > gsub << "EOF" #LOOP ./filter1.sh {check in line+ ../libSplits/$(path1)} {check out line+ ../filtered/$(root1).lib} #ENDLOOP EOF ls -1 ../libSplits/ > lib.lst gensub2 lib.lst single gsub spec para create spec para push para time #384 jobs in batch #0 jobs (including everybody's) in Parasol queue. #Checking finished jobs #Completed: 383 of 384 jobs #Crashed: 1 jobs #CPU time in finished jobs: 1149s 19.15m 0.32h 0.01d 0.000 y #IO & Wait Time: 1448s 24.14m 0.40h 0.02d 0.000 y #Average job time: 7s 0.11m 0.00h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 66s 1.10m 0.02h 0.00d #Submission to last job: 303s 5.05m 0.08h 0.00d # The crashed job actually worked. cd ../ for f in filtered/*; do cat $f >> filtered.lib; done cd ../ mkdir -p rmsk.repeatScout/{run,out} ssh kkstore02 cd /cluster/data/strPur1 cp -r unmaskedSplits/ /san/sanVol1/scratch/strPur1 exit cd rmsk.repeatScout/run/ cat > rmsk.sh << "EOF" #!/bin/bash thelib=/san/sanVol1/scratch/strPur1/repeatScout/filtered.lib fa=`basename $1` pushd /scratch oldDir=`dirs +1` cp $oldDir/$1 . /cluster/bluearc/RepeatMasker/RepeatMasker -lib $thelib $fa mkdir -p $oldDir/`dirname $2` cp $fa.out $oldDir/$2 rm $fa* popd EOF chmod +x rmsk.sh find ../../unmaskedSplits/ -name "*.fa" > fa.lst cat > gsub << "EOF" #LOOP ./rmsk.sh {check in line+ $(path1)} {check out exists ../out/$(lastDirs1=3)/$(file1).out} #ENDLOOP EOF ~/kent/src/parasol/bin/gensub2 fa.lst single gsub spec para create spec para try para push para time #Completed: 187983 of 187983 jobs #CPU time in finished jobs: 8288262s 138137.70m 2302.29h 95.93d 0.263 y #IO & Wait Time: 757852s 12630.87m 210.51h 8.77d 0.024 y #Average job time: 48s 0.80m 0.01h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 4578s 76.30m 1.27h 0.05d #Submission to last job: 50036s 833.93m 13.90h 0.58d ssh hgwdev cd /santest/scratch/strPur1/rmsk.repeatScout/ find out/ -name "*.fa.out" -type f -size +80c -exec tail +4 '{}' ';' > initial.noheader.out head -n 3 out/0/0/0/94000.fa.out > rmsk.header cat rmsk.header initial.noheader.out > initial.out # 100 cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=100 > 100.lib grep '^>' 100.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 100.lst tabGrep 100.lst 10 initial.noheader.out > 100.noheader.out cat rmsk.header 100.noheader.out > 100.out liftUp 100.lifted.out ../unmaskedSplits/splits.lft warn 100.out hgLoadOut -nosplit -table=repeatScout100 strPur1 100.lifted.out # 300 cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=300 > 300.lib grep '^>' 300.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 300.lst tabGrep 300.lst 10 initial.noheader.out > 300.noheader.out cat rmsk.header 300.noheader.out > 300.out liftUp 300.lifted.out ../unmaskedSplits/splits.lft warn 300.out # 1000 cat ../repeatScout/filtered.lib | filter-stage-2.prl --cat=initial.out --thresh=1000 > 1000.lib grep '^>' 1000.lib | sed 's/^>\(R=[[:digit:]]\+\).*/\1/' > 1000.lst tabGrep 1000.lst 10 initial.noheader.out > 1000.noheader.out cat rmsk.header 1000.noheader.out > 1000.out liftUp 1000.lifted.out ../unmaskedSplits/splits.lft warn 1000.out # Yeah ok finally a reduction in the amount of output. It's probably better to filter on score # versus copy #, but that has a bias towards really long repeats, and I'm not sure that's what # is desirable in terms of guessing what's biologically correct. # MAKE SOFT-MASKED FA/2BIT WITH REPEATSCOUT (DONE 9/9/2005 Andy) ssh eieio cd /cluster/data/strPur1 mkdir repeatScout/ cd repeatScout/ cp /san/sanVol1/scratch/strPur1/rmsk.repeatScout/100.lifted.out . cd ../maskedFa/ maskOutFa -soft ../downloads/urchinScaffolds.fa ../bed/simpleRepeat/trfMask.bed urchin.softMasked.fa maskOutFa -softAdd urchin.softMasked.fa ../repeatScout/100.lifted.out urchin.softWithRepeatScout.fa faSize urchin.softWithRepeatScout.fa #1096072416 bases (260724172 N's 835348244 real 595793115 upper 239555129 lower) in 187943 sequences in 1 files #Total size: mean 5831.9 sd 25954.4 min 100 (Scaffold188639) max 962155 (Scaffold1) median 909 #N count: mean 1387.3 sd 13142.5 #U count: mean 3170.1 sd 12856.6 #L count: mean 1274.6 sd 3702.0 # That's like 40% masked... OK that's too much. # PILER DENOVO REPEAT FINDING (STALLED 8/4/2005 Andy) ssh hgwdev cd /cluster/data/strPur1 mkdir -p piler/src cd piler/src/ wget http://www.drive5.com/piler/piler_source.tar.gz wget http://www.drive5.com/muscle/downloads3.52/muscle3.52_src.tar.gz wget http://www.drive5.com/pals/pals_source.tar.gz tar xfz muscle3.52_src.tar.gz tar xfz pals_source.tar.gz tar xfz piler_source.tar.gz mkdir pals piler mv pals*.gz pals mv piler*.gz piler rm muscle*.gz cd muscle3.52_src/ sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron mv Makefile Makefile.default mv Makefile.opteron Makefile make cp muscle /cluster/bin/x86_64 cd ../pals/ tar xfz pals_source.tar.gz rm pals_source.tar.gz sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron mv Makefile Makefile.default mv Makefile.opteron Makefile make cp pals /cluster/bin/x86_64 cd ../piler/ tar xfz piler_source.tar.gz rm piler_source.tar.gz sed 's/pentiumpro/opteron/g' Makefile > Makefile.opteron mv Makefile Makefile.default mv Makefile.opteron Makefile make cp piler /cluster/bin/x86_64 # Split up genome into 20 chunks on SAN. ssh kkstore02 cd /cluster/data/strPur1/piler mkdir splits cd splits/ faSplit sequence ../../downloads/urchinScaffolds.fa 20 urch_ ssh pk cd /cluster/data/strPur1/piler mkdir -p /san/sanvol1/scratch/strPur1/piler/splits cp splits/* /san/sanvol1/scratch/strPur1/piler/splits rm -rf splits/ ln -s /san/sanvol1/scratch/strPur1/piler/splits splits mkdir -p /san/sanvol1/scratch/strPur1/piler/palsSelfOut ln -s /san/sanvol1/scratch/strPur1/piler/palsSelfOut mkdir -p /san/sanvol1/scratch/strPur1/piler/palsSelfRun ln -s /san/sanvol1/scratch/strPur1/piler/palsSelfRun cd palsSelfRun/ ls -1 ../splits/ > fa.lst cat > doPalsSelf.sh << "EOF" #!/bin/sh tmp=/scratch/`basename $2` > /cluster/bin/x86_64/pals -self $1 -out $tmp > cp $tmp $2 rm $tmp EOF chmod +x doPalsSelf.sh #<<; cat > gsub << "EOF" #LOOP ./doPalsSelf.sh {check in line+ ../splits/$(path1)} {check out line+ ../palsSelfOut/pals_$(root1).gff} #ENDLOOP EOF # <<; gensub2 fa.lst single gsub spec para create spec para push para time #Completed: 19 of 20 jobs #Crashed: 1 jobs #CPU time in finished jobs: 12863s 214.39m 3.57h 0.15d 0.000 y #IO & Wait Time: 161s 2.68m 0.04h 0.00d 0.000 y #Average job time: 685s 11.42m 0.19h 0.01d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 1308s 21.80m 0.36h 0.02d #Submission to last job: 1356s 22.60m 0.38h 0.02d # The crashing one is a PROBLEM. Andy needs to e-mail Bob Edgar about this. # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 12/12/2005 Andy) ssh hgwdev hgsql -N hgcentraltest -e 'insert into blatServers values("strPur1", "blat19", "17782", 1, 0); insert into blatServers values("strPur1", "blat19", "17783", 0, 1);' # HUMAN PROTEINS TBLASTN (DONE 12/14/2005 Andy) # Split up sea urchin ssh kkstore02 cd urch/ twoBitToFa strPur1.2bit urchin.fa mkdir splits faSplit sequence splits.fa 60 splits/urch_ mkdir /san/sanVol1/scratch/strPur1/tblastn.{hg17KG,splits} cp splits/* /san/sanVol1/scratch/strPur1/tblastn.splits # Create blast databases ssh kolossus cd /san/sanVol1/scratch/strPur1/tblastn.hg17KG awk '{printf("0\t%s\t%s\t%s\t%s\n", $1, $2, $1, $2)}' /cluster/data/strPur1/chrom.sizes > garbage.lft ln -s ../../andy/blast mkdir splits cd splits/ ln -s ../../tblastn.splits/* . for fa in urch*.fa; do split=${fa%.fa}; echo $split ../blast/bin/formatdb -i $fa -pF -n $split done rm *.fa formatdb.log # protein setup mkdir humanKgExons cd humanKgExons/ splitFile /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl 38 kgExons_ for exonPsl in *; do pslxToFa $exonPsl $exonPsl.fa; rm $exonPsl; done # blast run ssh pk cd san/strPur1/tblastn.hg17KG/ ls -1S humanKgExons/*.fa > exons.lst ls -1S splits/urch*.nsq > targets.lst cp /cluster/data/hg17/bed/blat.hg17KG/protein.lft . cp /san/sanVol1/scratch/andy/tblastn{Exons,Gsub} . mkdir blastOut/ gensub2 targets.lst exons.lst tblastnGsub spec para create spec para try para push para time #Completed: 59040 of 59040 jobs #CPU time in finished jobs: 6544747s 109079.12m 1817.99h 75.75d 0.208 y #IO & Wait Time: 186223s 3103.72m 51.73h 2.16d 0.006 y #Average job time: 114s 1.90m 0.03h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 300s 5.00m 0.08h 0.00d #Submission to last job: 20123s 335.38m 5.59h 0.23d # Chaining part cat << "EOF" > tblastnChain #!/bin/bash dir=$1 echo $dir cat blastOut/${dir}/*.psl | simpleChain -prot -outPsl /dev/stdin simpleChain/c.${dir}.psl awk "(\$13 - \$12)/\$11 > 0.6 {print}" simpleChain/c.${dir}.psl > simpleChain/c60.${dir}.psl sort -rn simpleChain/c60.${dir}.psl | pslUniq stdin simpleChain/u.${dir}.psl awk "((\$1 / \$11) ) > 0.60 { print }" simpleChain/c60.${dir}.psl > simpleChain/m60.${dir}.psl EOF chmod +x tblastnChain for d in `cat dirs.lst`; do ./tblastnChain $d done cd simpleChain/ sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > ../blastHg17KG.psl # Loading ssh hgwdev cd /cluster/data/strPur1/bed/tblastn.hg17KG hgLoadPsl strPur1 blastHg17KG.psl # HUMAN BLASTZ (DONE 2/9/2006 Andy) ssh hgwdev cd san/strPur1/ mkdir blastz.hg18.2006-02-09 ln -s blastz.hg18.2006-02-09 blastz.hg18 cd blastz.hg18/ cp /cluster/data/strPur1/{strPur1.2bit,chrom.sizes} . cat << "_EOF_" > DEF # Sea urchin vs. Human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h ome/angie/schwartzbin:/parasol/bin BASE=/san/sanVol1/scratch/strPur1/blastz.hg18.2006-02-09 BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=$BASE/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY - Sea urchin SEQ2_DIR=$BASE/strPur1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=$BASE/chrom.sizes SEQ2_LIMIT=5000 TMPDIR=/scratch/tmp _EOF_ screen doBlastzChainNet.pl -bigClusterHub pk -smallClusterHub pk DEF >& run.log & screen -d # monitor and continue where it breaks ./cleanUp.csh cd ../ cp -r blastz.apiMel2.2005-10-22/ /cluster/data/triCas2/bed/ # CIONA BLASTZ (DONE 2/9/2006 Andy) ssh hgwdev cd san/strPur1/ mkdir blastz.ci2.2006-02-10 ln -s blastz.ci2.2006-02-10 blastz.ci2 cd blastz.hg18/ cp /cluster/data/strPur1/{strPur1.2bit,chrom.sizes} . cat << "_EOF_" > DEF # Sea urchin vs. Human export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/h ome/angie/schwartzbin:/parasol/bin BASE=/san/sanVol1/scratch/strPur1/blastz.ci2.2006-02-10 BLASTZ=blastz.v7.x86_64 BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=$BASE/HoxD55.q BLASTZ_ABRIDGE_REPEATS=0 # TARGET - Human hg18 SEQ1_DIR=/scratch/hg/hg18/nib SEQ1_CHUNK=10000000 SEQ1_LAP=10000 SEQ1_LEN=/scratch/hg/hg18/chrom.sizes # QUERY - Sea urchin SEQ2_DIR=$BASE/strPur1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=0 SEQ2_LEN=$BASE/chrom.sizes SEQ2_LIMIT=5000 TMPDIR=/scratch/tmp _EOF_ screen doBlastzChainNet.pl -bigClusterHub pk -smallClusterHub pk DEF >& run.log & screen -d # monitor and continue where it breaks ./cleanUp.csh cd ../ cp -r blastz.apiMel2.2005-10-22/ /cluster/data/triCas2/bed/ # REFSEQ MRNA (TRIED 2/8/2006) # I already have genbank.conf configured. ssh `fileServer /cluster/data/genbank/` cd /cluster/data/genbank nice ./bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -verbose=1 -initial strPur1 & # GENBANK EST ssh kkstore02 cd /cluster/data/genbank screen nice ./bin/gbAlignStep -srcDb=genbank -verbose=1 -initial strPur1 & # QUALITY SCORES (DONE 2/13/06 Andy) ssh kkstore02 mkdir /cluster/data/strPur1/bed/quality cd /cluster/data/strPur1/bed/quality wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20041123-contigs.qual.gz wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Spurpuratus/fasta/Spur20050415/contigs/Spur20041123-reptigs.qual.gz sed 's/AAG.*|//' ../../downloads/urchin.agp > urchin.agp zcat Spur20041123-*.gz > all.qual qaToQac all.qual all.qac qacAgpLift urchin.agp all.qac lifted.qac qacToWig -fixed lifted.qac stdout \ | wigEncode stdin strPur1.{wig,wib} ssh hgwdev cd /cluster/data/strPur1/bed/quality mkdir -p /gbdb/strPur1/wib ln -s `pwd`/strPur1.wib /gbdb/strPur1/wib hgLoadWiggle strPur1 quality strPur1.wig # GENSCAN PREDICTIONS AGAIN (DONE 2/27/2006 Andy) # 3 tables are messed up. liftUp didn't give the split hgsql strPur1 -e "select * from genscan" | tail +2 | sort -k2,2 -k4,4n > genscan.txt hgsql strPur1 -e "select * from genscanPep" | tail +2 > genscanPep.txt echo > genscanClean.py << "_EOF_" import sys, string # Deal with first table genscanTable = open(sys.argv[1]) genscanTableNew = sys.argv[1] + ".new" output = open(genscanTableNew, 'w') lines = genscanTable.readlines() counter = 1 scaff = "" hash = {} for line in lines: words = string.split(line.strip(), '\t', 2) if (words[1] != scaff): counter = 1 scaff = words[1] newname = scaff + ".%d" %(counter) hash[words[0]] = newname words[0] = newname counter = counter + 1 output.write(string.join(words,'\t') + '\n') output.close() genscanTable.close() print("Done with main table") # Deal with Pep table pepTable = open(sys.argv[2]) pepTableNew = sys.argv[2] + ".new" lines = pepTable.readlines() output = open(pepTableNew, 'w') for line in lines: words = string.split(line.strip(), '\t') words[0] = hash[words[0]] output.write(string.join(words, '\t') + '\n') output.close() pepTable.close() print("All done") _EOF_ python genscanClean.py genscan.txt genscanPep.txt hgsqldump -d strPur1 genscanPep > genscanPep.sql hgsql strPur1 -e 'drop table genscanPep' hgLoadSqlTab strPur1 genscanPep genscanPep.sql genscanPep.txt.new ldHgGene -predTab strPur1 genscan genscan.txt.new rm genscan*{new,txt,sql} ########################################################################### # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26 ) ssh kkstore02 bash # if not using bash shell already mkdir /cluster/data/strPur1/blastDb cd /cluster/data/strPur1 faToTwoBit strPur1.2bit temp.fa faSplit sequence temp.fa 500 blastDb/ rm temp.fa cd blastDb for i in *.fa do /cluster/bluearc/blast229/formatdb -i $i -p F done rm *.fa mkdir -p /san/sanvol1/scratch/strPur1/blastDb cd /cluster/data/strPur1/blastDb for i in nhr nin nsq; do echo $i cp *.$i /san/sanvol1/scratch/strPur1/blastDb done mkdir -p /cluster/data/strPur1/bed/tblastn.hg18KG cd /cluster/data/strPur1/bed/tblastn.hg18KG echo /san/sanvol1/scratch/strPur1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst wc -l query.lst # 494 query.lst # we want around 50000 jobs calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\) # 36727/(50000/494) = 362.862760 mkdir -p /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa split -l 360 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa/kg ln -s /cluster/bluearc/strPur1/bed/tblastn.hg18KG/kgfa kgfa cd kgfa for i in *; do nice pslxToFa $i $i.fa; rm $i; done cd .. ls -1S kgfa/*.fa > kg.lst mkdir -p /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut ln -s /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done tcsh cd /cluster/data/strPur1/bed/tblastn.hg18KG cat << '_EOF_' > blastGsub #LOOP blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } #ENDLOOP '_EOF_' cat << '_EOF_' > blastSome #!/bin/sh BLASTMAT=/cluster/bluearc/blast229/data export BLASTMAT g=`basename $2` f=/tmp/`basename $3`.$g for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11 do if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8 then mv $f.8 $f.1 break; fi done if test -f $f.1 then if /cluster/bin/i386/blastToPsl $f.1 $f.2 then liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2 if pslCheck -prot $3.tmp then mv $3.tmp $3 rm -f $f.1 $f.2 $f.3 $f.4 fi exit 0 fi fi rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 exit 1 '_EOF_' # << happy emacs chmod +x blastSome gensub2 query.lst kg.lst blastGsub blastSpec exit # back to bash ssh pk cd /cluster/data/strPur1/bed/tblastn.hg18KG para create blastSpec # para try, check, push, check etc. para time # Completed: 50882 of 50882 jobs # CPU time in finished jobs: 8964073s 149401.22m 2490.02h 103.75d 0.284 y # IO & Wait Time: 1545368s 25756.13m 429.27h 17.89d 0.049 y # Average job time: 207s 3.44m 0.06h 0.00d # Longest finished job: 983s 16.38m 0.27h 0.01d # Submission to last job: 133787s 2229.78m 37.16h 1.55d ssh kkstore04 cd /cluster/data/strPur1/bed/tblastn.hg18KG mkdir chainRun cd chainRun tcsh cat << '_EOF_' > chainGsub #LOOP chainOne $(path1) #ENDLOOP '_EOF_' cat << '_EOF_' > chainOne (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl) '_EOF_' exit chmod +x chainOne ls -1dS /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst gensub2 chain.lst single chainGsub chainSpec # do the cluster run for chaining ssh kk cd /cluster/data/strPur1/bed/tblastn.hg18KG/chainRun para create chainSpec para try, check, push, check etc. # Completed: 103 of 103 jobs # CPU time in finished jobs: 4863s 81.06m 1.35h 0.06d 0.000 y # IO & Wait Time: 88639s 1477.31m 24.62h 1.03d 0.003 y # Average job time: 908s 15.13m 0.25h 0.01d # Longest finished job: 1737s 28.95m 0.48h 0.02d # Submission to last job: 3936s 65.60m 1.09h 0.05d ssh kkstore04 cd /cluster/data/strPur1/bed/tblastn.hg18KG/blastOut bash # if using another shell for i in kg?? do cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl sort -rn c60.$i.psl | pslUniq stdin u.$i.psl awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl echo $i done sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/strPur1/bed/tblastn.hg18KG/blastHg18KG.psl cd .. pslCheck blastHg18KG.psl # this is ok. # load table ssh hgwdev cd /cluster/data/strPur1/bed/tblastn.hg18KG hgLoadPsl strPur1 blastHg18KG.psl # check coverage featureBits strPur1 refGene:cds blastHg18KG -enrichment # refGene:cds 0.065%, blastHg18KG 0.898%, both 0.024%, cover 37.31%, enrich 41.56x featureBits strPur1 blastHg18KG blastHg17KG -enrichment # blastHg18KG 0.898%, blastHg17KG 0.842%, both 0.749%, cover 83.46%, enrich 99.18x ssh kkstore04 rm -rf /cluster/data/strPur1/bed/tblastn.hg18KG/blastOut rm -rf /cluster/bluearc/strPur1/bed/tblastn.hg18KG/blastOut #end tblastn ########################################################################## ########################################################################## # CREATE LIFTOVER FROM strPur1 TO strPur2 # STARTED 2008-03-31 kord # strPur1 -> /cluster/store5/strPur1 # kkstore02-10:/export/cluster/store5 1.5T 1.1T 308G 78% /cluster/store5 ssh kkstore02 mkdir /cluster/data/strPur1/bed/blat.strPur2 cd /cluster/data/strPur1/bed/blat.strPur2 nice time doSameSpeciesLiftOver.pl strPur1 strPur2 \ -bigClusterHub pk \ -ooc /cluster/bluearc/strPur1/11.ooc \ -buildDir /cluster/data/strPur1/bed/blat.strPur2 >do.log 2>&1 & ssh pk cd /cluster/data/strPur1/bed/blat.strPur2/run.blat parasol list batches #user run wait done crash pri max batch #kord 396 28560 2904 0 10 -1 /cluster/store5/strPur1/bed/blat.strPur2/run.blat/ # *** All done! # *** Steps were performed in /cluster/data/strPur1/bed/blat.strPur2 # *** Test installation (/gbdb, goldenPath, hgLiftover operation) on hgwdev. # # 1.71user 0.69system 5:50:08elapsed 0%CPU (0avgtext+0avgdata 0maxresident)k # 0inputs+0outputs (9major+29957minor)pagefaults 0swaps # remove the symbolic link to liftOver chains and copy over the file rm ../liftOver/strPur1ToStrPur2.over.chain.gz cp -p strPur1ToStrPur2.over.chain.gz ../liftOver/ # a link in /usr/local/apache/htdocs/goldenPath/strPur2/liftOver # has already been made to this file and md5sum.txt needs to be updated ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/strPur1/liftOver md5sum *.gz > md5sum.txt