# for emacs: -*- mode: sh; -*-

# Ciona Intestinalis V1.0 from JGI

# DOWNLOAD SEQUENCE
ssh eieio
mkdir -p /cluster/store5/squirt/ci1
cd /cluster/store5/squirt/ci1
wget ftp://ftp.jgi-psf.org/pub/JGI_data/Ciona/v1.0/ciona.fasta.gz
gunzip ciona.fasta.gz

# translate to nib
ln -s /cluster/store5/squirt/ci1 ~/ci1
cd ~/ci1
mkdir scaffolds
faSplit byname ciona.fasta scaffolds/
mkdir nib
cd nib
foreach i (../scaffolds/*.fa)
    faToNib $i `basename $i .fa`.nib
end

# Create the database.
ssh hgwdev
echo 'create database ci1' | hgsql ''

# CREATING GRP TABLE FOR TRACK GROUPING
echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
    | hgsql ci1

echo 'insert into blatServers values("ci1", "blat10", "17780", "1"); \
      insert into blatServers values("ci1", "blat10", "17781", "0");' \
    | hgsql -h genome-testdb hgcentraltest

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION
# Make symbolic links in /gbdb/ci1/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/ci1
ln -s /cluster/store5/squirt/ci1/nib /gbdb/ci1/nib
# Load /gbdb/ci1/nib paths into database
hgsql ci1 < ~/kent/src/hg/lib/chromInfo.sql
cd ~/ci1
hgNibSeq -preMadeNib ci1 /gbdb/ci1/nib scaffolds/*.fa

# make gcPercent table
ssh hgwdev
mkdir -p ~/ci1/bed/gcPercent
cd ~/ci1/bed/gcPercent
hgsql ci1 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent ci1 ../../nib

### AUTO UPDATE GENBANK MRNA RUN
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .

# Edit etc/genbank.conf and add these lines:
# ci1 (ciona intestinalis)
ci1.genome = /iscratch/i/squirt/ci1/nib/Scaffold*.nib
ci1.lift = no
ci1.refseq.mrna.native.load = yes
ci1.refseq.mrna.xeno.load = yes
ci1.refseq.mrna.xeno.pslReps = -minCover=0.25 -minAli=0.15 -nearTop=0.005
ci1.genbank.mrna.xeno.load = yes
ci1.genbank.est.xeno.load = no
ci1.downloadDir = ci1
ci1.perChromTables = no
# end ci1

cvs ci etc/genbank.conf
make
# Install to /cluster/data/genbank:
make install-server

ssh eieio
cd /cluster/data/genbank
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -clusterRootDir=/cluster/bluearc/genbank \
    -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \
    -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \
    -srcDb=genbank -type=mrna -verbose=1 -initial ci1
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad ci1
# Clean up:
rm -r /cluster/bluearc/genbank/work/initial.ci1

ssh eieio
# -initial run for ESTs (now with /cluster/store7 and iservers):
nice bin/gbAlignStep -clusterRootDir=/cluster/store7/genbank \
    -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \
    -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \
    -srcDb=genbank -type=est -verbose=1 -initial ci1
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 ci1
# ./bin/gbDbLoadStep -noPerChrom=ci1 -initialLoad ci1
# ./bin/i386/gbLoadRna -drop ci1

# drop indices whose key prefix is too short for scaffold names,
# then recreate them with a 13-character tName prefix
foreach i (all_mrna intronEst all_est xenoMrna)
    echo "drop index tName on $i ;" | hgsql ci1
    echo "drop index tName_2 on $i ;" | hgsql ci1
    echo "drop index tName_3 on $i ;" | hgsql ci1
    echo "dropped indices on $i"
end
foreach i (all_mrna intronEst all_est xenoMrna)
    echo "create index tName on $i (tName(13), bin) ;"
    echo "create index tName on $i (tName(13), bin) ;" | hgsql ci1
    echo "create index tName_2 on $i (tName(13), tStart);"
    echo "create index tName_2 on $i (tName(13), tStart);" | hgsql ci1
    echo "create index tName_3 on $i (tName(13), tEnd);"
    echo "create index tName_3 on $i (tName(13), tEnd);" | hgsql ci1
    echo "created indices on $i"
end
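# Not part of the original build: a minimal helper sketch that wraps the
# drop/recreate pattern above so it could be reused for the other PSL tables
# later in this doc, assuming the same hgsql access and that a 13-character
# tName prefix still covers the scaffold names.  The name reindexTName.sh is
# just an example.
cat << '_EOF_' > reindexTName.sh
#!/bin/sh
# usage: ./reindexTName.sh db table [table ...]
db=$1; shift
for t in "$@"
do
    for idx in tName tName_2 tName_3
    do
        echo "drop index $idx on $t;" | hgsql $db
    done
    echo "create index tName on $t (tName(13), bin);" | hgsql $db
    echo "create index tName_2 on $t (tName(13), tStart);" | hgsql $db
    echo "create index tName_3 on $t (tName(13), tEnd);" | hgsql $db
    echo "reindexed $t"
done
'_EOF_'
chmod +x reindexTName.sh
# example: ./reindexTName.sh ci1 all_mrna intronEst all_est xenoMrna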
### gap and repeats tables
ssh hgwdev
mkdir -p /cluster/data/ci1/bed/gapRmsk
cd /cluster/data/ci1/bed/gapRmsk
simpleGap /cluster/data/ci1/nib gap.bed repeats.bed
echo "drop table gap;" | hgsql ci1
hgsql ci1 < ~/kent/src/hg/lib/gap.sql
hgLoadBed -oldTable ci1 gap gap.bed
echo "drop index bin on gap ;" | hgsql ci1
echo "create index chrom on gap (chrom(13), bin) ;" | hgsql ci1
echo "create index chrom_2 on gap (chrom(13), chromStart);" | hgsql ci1
echo "create index chrom_3 on gap (chrom(13), chromEnd);" | hgsql ci1

# do RepeatMasking; after loading rmsk, fix up its indices for scaffold names
cd /cluster/data/ci1
echo "drop index bin on rmsk;" | hgsql ci1
echo "drop index genoStart on rmsk;" | hgsql ci1
echo "drop index genoEnd on rmsk;" | hgsql ci1
echo "create index chrom_2 on rmsk (genoName(13), genoStart);" | hgsql ci1
echo "create index chrom_3 on rmsk (genoName(13), genoEnd);" | hgsql ci1

# simple repeats (TRF)
ssh eieio
mkdir -p /cluster/data/ci1/bed/simpleRepeat
cd /cluster/data/ci1/bed/simpleRepeat
mkdir trf
for i in ../../scaffolds/*.fa
do
    trfBig $i /dev/null -bedAt=trf/`basename $i .fa`.bed > /dev/null 2>&1
    echo $i
done
cat trf/* > simpleRepeat.bed

ssh hgwdev
hgLoadBed ci1 simpleRepeat \
    /cluster/data/ci1/bed/simpleRepeat/simpleRepeat.bed \
    -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

ssh eieio
cd /cluster/data/ci1/bed/simpleRepeat
mkdir -p trfMask
cd trf
for i in *.bed
do
    awk '{if ($5 <= 12) print;}' $i > ../trfMask/$i
done

# use RepeatMasker and simpleRepeat output to build soft-masked fa's
cd /cluster/data/ci1
mkdir maskedScaffolds
cd scaffolds
for i in *.fa
do
    maskOutFa -soft $i ../bed/RM/out/$i.out ../maskedScaffolds/$i
    maskOutFa -softAdd ../maskedScaffolds/$i \
        ../bed/simpleRepeat/trfMask/`basename $i .fa`.bed ../maskedScaffolds/$i
done

# Rebuild the nib files, using the soft masking in the fa:
mkdir -p /cluster/data/ci1/nib
cd /cluster/data/ci1/maskedScaffolds
for i in *.fa
do
    faToNib -softMask $i ../nib/`basename $i .fa`.nib
done
# Make one big 2bit file as well, and make a link to it in
# /gbdb/ci1/nib because hgBlat looks there (or so says galGal2)
faToTwoBit *.fa ../ci1.2bit
ln -s /cluster/data/ci1/ci1.2bit /gbdb/ci1/nib/

### JGI ciona genes
# the GFF file on the JGI web site is garbage (per Daniel Rokhsar, dsrokhsar@lbl.gov)
# build a genePred file from predicted transcripts and predicted proteins.
# first blat transcripts against ci1
ssh kk
mkdir /cluster/data/ci1/bed/jgiGene
cd /cluster/data/ci1/bed/jgiGene
wget "ftp://ftp.jgi-psf.org/pub/JGI_data/Ciona/v1.0/ciona.mrna.fasta.gz"
gunzip ciona.mrna.fasta.gz
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -q=dna -t=dna {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
ls -1S /iscratch/i/squirt/ci1/nib/* > squirt.lst
mkdir fas
faSplit sequence *.fasta 40 fas/ci
ls -1S fas/* > mrna.lst
gensub2 squirt.lst mrna.lst gsub spec
mkdir psl
cd psl
for i in `cat ../squirt.lst`
do
    mkdir `basename $i .nib`
done
cd ..
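# Optional sanity check, not part of the original run: make sure one psl/
# output subdirectory exists per target nib in squirt.lst before creating
# the para batch (paths as set up above).
nNib=`wc -l < squirt.lst`
nDir=`ls -d psl/*/ | wc -l`
if [ $nNib -ne $nDir ]; then
    echo "mismatch: $nNib nibs in squirt.lst vs $nDir psl/ subdirectories"
fi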
para create spec
para push
# Completed: 100040 of 100040 jobs
# CPU time in finished jobs:      78739s    1312.32m    21.87h    0.91d  0.002 y
# IO & Wait Time:                284574s    4742.89m    79.05h    3.29d  0.009 y
# Average job time:                    4s       0.06m     0.00h    0.00d
# Longest job:                        37s       0.62m     0.01h    0.00d
# Submission to last job:           1766s      29.43m     0.49h    0.02d

ssh eieio
pslSort dirs raw.psl /tmp psl/*
sed "1,5d" raw.psl | sort -rn | pslUniq stdin mrna.psl

# now blat proteins against predicted transcripts to get CDS
wget "ftp://ftp.jgi-psf.org/pub/JGI_data/Ciona/v1.0/ciona.prot.fasta.gz"
gunzip ciona.prot.fasta.gz
blat -q=prot -t=dnax ciona.mrna.fasta ciona.prot.fasta cds.psl
sort -rn cds.psl | pslUniq stdin stdout \
    | awk "{printf \"%s\t%s..%s\n\",\$14,\$16+1,\$17}" > jgiGene.cds
mrnaToGene -cdsFile=jgiGene.cds mrna.psl jgiGene.gp -genePredExt > log 2>&1
gzip *fasta
rm -rf batch* err fas *.tab log psl *.psl para* spec

ssh hgwdev
cd /cluster/data/ci1/bed/jgiGene
ldHgGene ci1 jgiGene jgiGene.gp -predTab -genePredExt
hgPepPred ci1 generic jgiGenePep /cluster/data/ci1/bed/jgiGene/ciona.prot.fasta

### SNAP GENE PREDICTIONS FROM COLIN DEWEY (DONE 4/23/04 angie)
ssh hgwdev
mkdir /cluster/data/ci1/bed/snap
cd /cluster/data/ci1/bed/snap
# contact: Colin Dewey
wget http://hanuman.math.berkeley.edu/~cdewey/tracks/ci1/SNAP.gff.gz
gunzip SNAP.gff.gz
ldHgGene -gtf -frame -id -geneName ci1 snapGene SNAP.gff

### tBLASTn human proteins
ssh kk
cd /cluster/data/ci1/bed
mkdir tblastnHg16KG
cd tblastnHg16KG
ls -1S /iscratch/i/squirt/ci1/blastDb/*.nsq | sed "s/.nsq//" > squirt.lst
mkdir kgfas
cd kgfas
split -l 164 /cluster/data/hg16/bed/blat.hg16KG.2004-05-27/uniq.psl kg
foreach i (kg*)
    pslxToFa $i $i.fa
end
cd ..
ssh kkr1u00
mkdir /iscratch/i/squirt/ci1/kgfas
cd /cluster/data/ci1/bed/tblastnHg16KG/kgfas
cp *.fa /iscratch/i/squirt/ci1/kgfas
exit
ls -1S /iscratch/i/squirt/ci1/kgfas/*.fa > kg.lst
# get blastGsub and blastSome from previous version
gensub2 squirt.lst kg.lst blastGsub blastSpec
mkdir blastOut
cd blastOut
foreach i (`cat ../kg.lst`)
    mkdir `basename $i .fa`
end
cd ..
para create blastSpec
para push
# Completed: 10040 of 10040 jobs
# CPU time in finished jobs:    2489065s   41484.41m   691.41h   28.81d  0.079 y
# IO & Wait Time:                214441s    3574.02m    59.57h    2.48d  0.007 y
# Average job time:                 269s       4.49m     0.07h    0.00d
# Longest job:                      732s      12.20m     0.20h    0.01d
# Submission to last job:          7374s     122.90m     2.05h    0.09d
cd blastOut
foreach i (kg??)
    cat $i/q.*.psl > q.$i.psl
    cat $i/q.*.bscore > q.$i.bscore
    cat $i/t.*.psl > t.$i.psl
    cat $i/t.*.bscore > t.$i.bscore
    scoreSort t.$i.psl t.$i.bscore ts.$i.psl ts.$i.bscore
    echo $i
end
cd ..
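# Optional check, not part of the original run: make sure every kg?? split
# produced non-empty concatenated psl and bscore files before chaining.
cd blastOut
foreach i (kg??)
    foreach f (q.$i.psl q.$i.bscore t.$i.psl t.$i.bscore)
        if (-z $f || ! -e $f) echo "missing or empty: $f"
    end
end
cd ..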
ls -1 blastOut/q.kg??.psl > chain.lst
cat << '_EOF_' > chainGsub
#LOOP
./chainSome blastOut/$(root1).psl blastOut/$(root1).bscore /cluster/data/ci1/nib /cluster/data/hg16/bed/blat.hg16KG/known.fa /scratch/blast/data/BLOSUM80 blastOut/$(root1).0.psl blastOut/$(root1).0.bscore
#ENDLOOP
'_EOF_'
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
./shove 40 10
# Completed: 251 of 251 jobs
# CPU time in finished jobs:       3564s      59.40m     0.99h    0.04d  0.000 y
# IO & Wait Time:                  5806s      96.77m     1.61h    0.07d  0.000 y
# Average job time:                  37s       0.62m     0.01h    0.00d
# Longest job:                       57s       0.95m     0.02h    0.00d
# Submission to last job:          1494s      24.90m     0.41h    0.02d

cd blastOut
cat q.*.0.psl > allQ0.psl
cat q.*.0.bscore > allQ0.bscore
scoreSort allQ0.psl allQ0.bscore allProts.psl allProts.bscore
pslUniq allProts.psl uniqProts.psl
sort -n --key=16 uniqProts.psl | sort --key=14 | uniq > bestHg16KG.psl
# cp bestHg16KG.psl blastHg16KG.psl
protDat bestHg16KG.psl /cluster/data/hg16/bed/blat.*7/hg16KG.psl \
    /cluster/data/hg16/bed/blat.*7/kg.mapNames blastHg16KG.psl
hgLoadPsl ci1 blastHg16KG.psl
echo "drop index bin on blastHg16KG ;" | hgsql ci1
echo "drop index tStart on blastHg16KG ;" | hgsql ci1
echo "drop index tEnd on blastHg16KG ;" | hgsql ci1
echo "drop index qName on blastHg16KG ;" | hgsql ci1
echo "create index tName on blastHg16KG (tName(13), bin) ;" | hgsql ci1
echo "create index tName_2 on blastHg16KG (tName(13), tStart);" | hgsql ci1
echo "create index tName_3 on blastHg16KG (tName(13), tEnd);" | hgsql ci1
echo "create index qName on blastHg16KG (qName(10));" | hgsql ci1
foreach i ( kg?? )
    findExons uniqProts.psl ts.$i.psl x.$i.psl xa.$i.psl
end
cat x.*.psl > allXA.psl
liftUp -type=.psl -nohead -pslQ -isPtoG liftAllXA.psl \
    /cluster/data/hg16/bed/blat.*7/genome.lft warn allXA.psl
sort -n --key=16 liftAllXA.psl | sort --key=14 | uniq \
    | sed "s/ci1.//" | pslPosTarget stdin tblastHg16.psl
hgLoadPsl ci1 tblastHg16.psl
simpleChain tblastHg16.psl chainHg16.chain
chainSwap chainHg16.chain chainCi1.chain
./revChainScore chainCi1.chain chainCi1S.chain
chainSwap chainCi1S.chain chainHg16S.chain
hgLoadChain hg16 chainCi1ProtEx chainCi1S.chain
hgLoadChain ci1 chainHg16ProtEx chainHg16S.chain

### blat CioSav1
ssh kk
mkdir /cluster/data/ci1/bed/blatCioSav1
cd /cluster/data/ci1/bed/blatCioSav1
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
ls -1S /scratch/hg/ci1/nib/Scaffold_*.nib > squirt.lst
ls -1S /iscratch/i/squirt/ci1/Savignyi/*.fa > other.lst
gensub2 squirt.lst other.lst gsub spec
mkdir psl
cd psl
for i in `cat ../squirt.lst`
do
    mkdir `basename $i .nib`
done
cd ..
para create spec
para push

ssh eieio
pslSort dirs blatCioSav1.psl /tmp psl/*
ssh hgwdev
cd /cluster/data/ci1/bed/blatCioSav1
hgLoadPsl ci1 blatCioSav1.psl
### end blat

# MAKE DOWNLOADABLE SEQUENCE FILES
ssh kksilo
cd /cluster/data/ci1
#- Build the .zip files
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
cd bed/RM
zip -j ../../zip/ScaffoldOut.zip out/Scaffold*.out
cd ../../
zip -j zip/ScaffoldFa.zip scaffolds/*.fa
zip -j zip/ScaffoldFaMasked.zip maskedScaffolds/*.fa
cd bed/simpleRepeat
zip -j ../../zip/ScaffoldTrf.zip trfMask/*.bed
cd ../..
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=ci1 -native GenBank mrna /cluster/data/ci1/zip/mrna.fa
cd /cluster/data/ci1/zip
zip -j mrna.zip mrna.fa
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipAll.csh |& tee zipAll.log
cd zip
#- Look at zipAll.log to make sure all file lists look reasonable.
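# Optional, not part of the original run: a quick member-count sanity check
# on the sequence zips, assuming the layout produced by zipAll.csh above
# (we are in /cluster/data/ci1/zip at this point).
set nFa = `ls ../scaffolds/*.fa | wc -l`
foreach z (ScaffoldFa.zip ScaffoldFaMasked.zip)
    set n = `unzip -l $z | grep -c '[.]fa$'`
    echo "$z has $n members, expect $nFa"
end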
#- Check zip file integrity:
foreach f (*.zip)
    unzip -t $f > $f.test
    tail -1 $f.test
end
wc -l *.zip.test

#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/ci1/zip
set gp = /usr/local/apache/htdocs/goldenPath/ci1
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
# mkdir -p $gp/scaffolds
# foreach f ( ../*/chr*.fa )
#   zip -j $gp/chromosomes/$f:t.zip $f
# end
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# cd $gp/chromosomes
# md5sum *.zip > md5sum.txt
# Take a look at bigZips/* and chromosomes/*, update their README.txt's

# CREATE gc5Base wiggle TRACK (DONE, 2004-04-16, Hiram)
# Perform a gc count with a 5 base window.
# Also compute a "zoomed" view for display efficiency.
# on the file server for this disk intensive operation
ssh eieio
mkdir /cluster/data/ci1/bed/gc5Base
cd /cluster/data/ci1/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent.  The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100].  Two columns come out of the awk print statement:
# <position> and <value>, which are fed into wigAsciiToBinary through
# the pipe.  It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>.  The result files end up in ./wigData5.
cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in /cluster/data/ci1/nib/Scaff*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/Scaffold_//"`
    echo -n "working on ${c} - ${C} ... "
    hgGcPercent -chr=${c} -doGaps \
        -file=stdout -win=5 ci1 /cluster/data/ci1/nib | grep -w GC | \
    awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
    wigAsciiToBinary \
        -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
        -name=${C} stdin 2> dataLimits5/${C}
    echo "done"
done
'_EOF_'
chmod +x runGcPercent.sh
time ./runGcPercent.sh
#   real    11m29.211s
#   user    9m48.090s
#   sys     3m41.940s

# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/ci1/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/ci1/wib/gc5Base ci1 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir -p /gbdb/ci1/wib/gc5Base
# arg list too long to do this simply
cd wigData5
ls *.wib | while read FN
do
    ln -s `pwd`/${FN} /gbdb/ci1/wib/gc5Base
    echo ${FN}
done
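# Optional, not part of the original run: verify that every .wib file in
# wigData5 ended up with a symlink under /gbdb/ci1/wib/gc5Base.
for w in *.wib
do
    [ -e /gbdb/ci1/wib/gc5Base/$w ] || echo "missing link: $w"
done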
# to speed up display for whole chromosome views, compute a "zoomed"
# view and load that on top of the existing table.  The savings
# comes from the number of data table rows the browser needs to load
# for a full chromosome view.  Without the zoomed view there are
# over 43,000 data rows for chrom 1.  With the zoomed view there are
# only 222 rows needed for the display.  If your original data was
# at 1 value per base the savings would be even greater.
# Pretty much the same data calculation situation as above, although
# this time note the use of 'wigZoom -dataSpan=1000 stdin' in the
# pipeline.  This will average together the data points coming out of
# the awk print statement over a span of 1000 bases.  Thus each <value>
# coming out of wigZoom will represent the measurement of GC in the
# next 1000 bases.  Note the use of -dataSpan=1000 on wigAsciiToBinary
# to account for this type of data.  You want your dataSpan here to be
# an exact multiple of your original dataSpan (5*200=1000) and on the
# order of at least 1000; it doesn't need to go too high.  For data that
# is originally at 1 base per value, a convenient span is: -dataSpan=1024
# A new set of result files ends up in ./wigData5_1K/*.wi[gb]
cat << '_EOF_' > runZoom.sh
#!/bin/sh
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
for n in /cluster/data/ci1/nib/Scaff*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/Scaffold_//"`
    echo -n "working on ${c} - ${C} ... "
    hgGcPercent -chr=${c} -doGaps \
        -file=stdout -win=5 ci1 /cluster/data/ci1/nib | grep -w GC | \
    awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
    wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
        -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
        -name=${C} stdin 2> dataLimits5_1K/${C}
    echo "done"
done
'_EOF_'
chmod +x runZoom.sh
time ./runZoom.sh

# Then load these .wig files into the same database as above
ssh hgwdev
cd /cluster/data/ci1/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/ci1/wib/gc5Base -oldTable ci1 \
    gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/ci1/wib/gc5Base
# arg list too long to do this simply
cd wigData5_1K
ls *.wib | while read FN
do
    ln -s `pwd`/${FN} /gbdb/ci1/wib/gc5Base
    echo ${FN}
done

# SWAP CI1-CIOSAV1 BLASTZ
ssh kksilo
mkdir -p /cluster/data/ci1/bed/blastz.cioSav1.swap.2004-08-03
ln -s /cluster/data/ci1/bed/blastz.cioSav1.swap.2004-08-03 \
    /cluster/data/ci1/bed/blastz.cioSav1
cd /cluster/data/ci1/bed/blastz.cioSav1
set aliDir = /cluster/data/cioSav1/bed/blastz.ci1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
cat $aliDir/axtChrom/*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSort stdin all.axt
nice axtChain -verbose=0 all.axt /iscratch/i/ci1/nib /iscratch/i/cioSav1/nib all.chain &
wait
# Load chains into database
ssh hgwdev
cd /cluster/data/ci1/bed/blastz.cioSav1
hgLoadChain ci1 chainCioSav1 all.chain
rm *.tab

# NET CIOSAV1 BLASTZ
ssh kksilo
cd /cluster/data/ci1/bed/blastz.cioSav1
chainPreNet all.chain S1.len S2.len stdout \
    | chainNet stdin -minSpace=1 S1.len S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/ci1/bed/blastz.cioSav1
netClass -noAr noClass.net ci1 cioSav1 cioSav1.net
exit
# back at kksilo
# Make a 'syntenic' subset:
rm noClass.net
netFilter -syn cioSav1.net > cioSav1Syn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/ci1/bed/blastz.cioSav1
netFilter -minGap=10 cioSav1.net | hgLoadNet ci1 netCioSav1 stdin
netFilter -minGap=10 cioSav1Syn.net | hgLoadNet ci1 netSyntenyCioSav1 stdin

# MAKE Human Proteins track (hg17)
ssh kksilo
mkdir -p /cluster/data/ci1/bed/tblastn.hg17KG
cd /cluster/data/ci1/bed/tblastn.hg17KG
ls -1S /iscratch/i/ci1/blastDb/*.nsq | sed "s/\.nsq//" > ciona.lst
mkdir kgfa
split -l 100 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
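# Optional check, not part of the original run: count the split query files
# and the fasta records they contain before building kg.lst below.
nFa=`ls kgfa/*.fa | wc -l`
nSeq=`cat kgfa/*.fa | grep -c '^>'`
echo "$nFa split files, $nSeq query sequences total"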
ls -1S kgfa/*.fa > kg.lst
mkdir blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done

cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 ciona.lst kg.lst blastGsub blastSpec

ssh kk
cd /cluster/data/ci1/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 16520 of 16520 jobs
# CPU time in finished jobs:    2469697s   41161.62m   686.03h   28.58d  0.078 y
# IO & Wait Time:               2404362s   40072.69m   667.88h   27.83d  0.076 y
# Average job time:                 295s       4.92m     0.08h    0.00d
# Longest job:                     3067s      51.12m     0.85h    0.04d

cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7500 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome

ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/ci1/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 413 of 413 jobs
# CPU time in finished jobs:         43s       0.72m     0.01h    0.00d  0.000 y
# IO & Wait Time:                  1752s      29.19m     0.49h    0.02d  0.000 y
# Average job time:                   4s       0.07m     0.00h    0.00d
# Longest job:                        12s       0.20m     0.00h    0.00d
# Submission to last job:            243s       4.05m     0.07h    0.00d
exit
# back to eieio
cd /cluster/data/ci1/bed/tblastn.hg17KG/blastOut
# filter the chained alignments by query coverage and identity
# (see the reference note at the end of this section for the PSL columns used)
for i in kg??
do
    awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "(\$1 / \$11) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
    > ../blastHg17KG.psl
cd ..

ssh hgwdev
cd /cluster/data/ci1/bed/tblastn.hg17KG
hgLoadPsl ci1 blastHg17KG.psl
echo "drop index tName on blastHg17KG ;" | hgsql ci1
echo "drop index tName_2 on blastHg17KG ;" | hgsql ci1
echo "drop index tName_3 on blastHg17KG ;" | hgsql ci1
echo "create index tName on blastHg17KG (tName(13), bin) ;" | hgsql ci1
echo "create index tName_2 on blastHg17KG (tName(13), tStart);" | hgsql ci1
echo "create index tName_3 on blastHg17KG (tName(13), tEnd);" | hgsql ci1
exit
# back to kksilo
rm -rf blastOut
# End tblastn
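# For reference, not part of the original run: in the awk filters above,
# PSL column 1 is matches, 11 is qSize, 12 is qStart and 13 is qEnd, so
# ($13 - $12)/$11 > 0.6 keeps chains spanning >60% of the protein query
# and $1/$11 > 0.60 keeps chains where >60% of the query actually matches.
# A minimal sketch that tallies how many alignments in one split pass each
# cutoff; it would have to be run inside blastOut before the cleanup above,
# and c.kgaa.psl is just an example file name.
awk '{ if (($13-$12)/$11 > 0.6) nSpan++; if ($1/$11 > 0.60) nIdent++ } END { printf "span>0.6: %d  match>0.6: %d  of %d alignments\n", nSpan, nIdent, NR }' c.kgaa.psl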