#!/bin/sh
# Guard: this file is a log of commands, not something to execute.
echo "This is not a script"
exit 255

######### Creating the Intronerator for C. elegans WS120 version
#
# This is done after the Ce2 browser has been built, using
# the sequences from there.
#
# The "ssh <host>" lines below presumably mark the machine on which
# the commands that follow were typed -- confirm before reuse.
#

# Linking to sequence and creating NIB files (old style nibs)
ssh hgwdev
cd /projects/compbio/experiments/worm
# The work directory must be named idb.ws120: every later step in these
# notes changes into /projects/compbio/experiments/worm/idb.ws120.
mkdir idb.ws120
cd idb.ws120
mkdir sanger
ln -s /cluster/data/ce2/sanger/* sanger
mkdir nt4
cd nt4
# program fatont4 is in kent/src/reformat/fatont4
# The *.nt4 files created are machine specific.
# You can not use nt4 files on different machines.
# ALSO, the index files can not be used on different machines.
# If you want to move the Intronerator to a different machine
# you basically need to do this processing on that machine
#
# The delimiter is quoted so nothing inside the here-document is expanded
# while dna2nt4.sh is written; under sh/bash the closing line is the
# unquoted word _EOF_.
cat << '_EOF_' > dna2nt4.sh
#!/bin/sh
# Convert the chromosome fasta files found in $S into nt4 files in the
# current directory.
S=/cluster/data/ce2/sangerFa
export S

# DoOne <chrom> <base>
#   Runs fatont4 on ${S}/chr<chrom>.fa to produce <base>.nt4, unless
#   <base>.nt4 already exists as a regular file.
DoOne() {
if [ ! -f "$2.nt4" ]; then
    echo "chr$1.fa -> $2.nt4"
    rm -f "$2.nt4"
    fatont4 "${S}/chr$1.fa" "$2.nt4"
else
    echo "$2.nt4 already complete"
fi
}

DoOne M m
DoOne I i
DoOne II ii
DoOne III iii
DoOne IV iv
DoOne X x
DoOne V v
_EOF_
chmod +x dna2nt4.sh
time ./dna2nt4.sh

# create features listings from gff files
# binary makec2c is in kent/src/reformat/makec2c
ssh hgwdev
cd /projects/compbio/experiments/worm/idb.ws120
mkdir features
time makec2c sanger features/c2c /dev/null > features/make.c2c.out 2>&1
# real 0m31.317s
# user 0m27.290s
# sys 0m3.220s

# these *.dir files are used by some of the processing utilities
# here and in the browser to find these directories
#
mkdir features/sanger
cd features/sanger
echo "/projects/compbio/experiments/worm/idb.ws120/cDNA/" > cdna.dir
echo "/projects/compbio/experiments/worm/idb.ws120/features/" > feat.dir
echo "/projects/compbio/experiments/worm/idb.ws120/nt4/" > nt4.dir
echo "/projects/compbio/experiments/worm/idb.ws120/xeno/" > xeno.dir
cd ../..
# Make the *.dir files visible from the top of the work directory too.
ln -s features/sanger/*.dir .
# Source of makepgo: kent/src/index/makepgo
makepgo features/c2c features/ .coo

# gffgenes is run from inside the sanger directory; the subshell keeps
# the working directory of the session unchanged afterwards.
(
    cd sanger
    time gffgenes ../features/sanger/c2g ../features/sanger/genes.gdf \
        > ../features/sanger/gffgenes.out 2>&1
)
# Recorded timing:
#   real 0m36.061s
#   user 0m31.660s
#   sys 0m3.660s
# Recorded per-chromosome counts:
#   Processing CHROMOSOME_I.gff.gz
#   Read 6479 genes (103864 exons) in CHROMOSOME_I.gff.gz
#   Processing CHROMOSOME_II.gff.gz
#   Read 7541 genes (108565 exons) in CHROMOSOME_II.gff.gz
#   Processing CHROMOSOME_III.gff.gz
#   Read 5971 genes (93138 exons) in CHROMOSOME_III.gff.gz
#   Processing CHROMOSOME_IV.gff.gz
#   Read 7198 genes (106683 exons) in CHROMOSOME_IV.gff.gz
#   Processing CHROMOSOME_V.gff.gz
#   Read 10137 genes (139252 exons) in CHROMOSOME_V.gff.gz
#   Processing CHROMOSOME_X.gff.gz
#   Read 6141 genes (112575 exons) in CHROMOSOME_X.gff.gz

# Keep a copy of c2g directly under features, then build the .pgo files.
cp -p features/sanger/c2g features
makepgo features/sanger/c2g features/sanger/ .pgo
# Recorded output:
#   Writing 6479 entries nameSize 20 into features/sanger/i.pgo
#   Writing 7541 entries nameSize 14 into features/sanger/ii.pgo
#   Writing 5971 entries nameSize 13 into features/sanger/iii.pgo
#   Writing 7198 entries nameSize 13 into features/sanger/iv.pgo
#   Writing 10137 entries nameSize 13 into features/sanger/v.pgo
#   Writing 6141 entries nameSize 12 into features/sanger/x.pgo
#   Writing 0 entries nameSize 0 into features/sanger/m.pgo

# Source of indexgl: kent/src/index/indexgl
( cd features/sanger; indexgl genes.gdf genes.ix )
# Recorded output:
#   Reading input
#   Got 43467 offsets 21 nameSize. Sorting...
#   Writing index file

# Source of ixword3: kent/src/index/ixword3
# One index per listing, c2c first and then c2g.
for listing in c2c c2g
do
    ixword3 features/$listing features/$listing.ix
done
# Recorded output for c2c: Got 3391 offsets 10 nameSize. Sorting...
# Recorded output for c2g: Got 43467 offsets 21 nameSize. Sorting...
# down load genbank mRNA's
# mRNA_title.gb obtained from http://www.ncbi.nlm.nih.gov/entrez/
# with a search on "nucleotide" and a search string of:
# mRNA[Title] AND Caenorhabditis Elegans[Organism]
# resulting in 254069 matches, downloaded in "GenBank" format
# and a pretty significant file:
# -rw-rw-r-- 1 hiram 572034420 Mar 2 00:28 mRNA_title.gb

# convert the downloaded genbank format into a .fa and a .cdi file:
# can build gb2cdi binary in src/reformat/gb2cdi
# make sure it gets into your cse home bin MACHTYPE directory
# for this kks00 operation
#
# NOTE(review): no step in these notes creates the cDNA directory or
# places mRNA_title.gb in it; presumably that was done by hand -- confirm.
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120/cDNA
gb2cdi mRNA_title.gb allcdna.fa allcdna.cdi > gb2cdi.out 2>&1
tail -4 gb2cdi.out
# 96094 by Kohara, 1515 by Martin, 3184 by Stratagene,
# 388 Univ Ariz/U Wash, 152880 full cDNAs
# 0 male 141394 hermaphrodite 86165 mixed
# 32 adult 71547 embryo 80454 varied 70110 L1 5180 L2 3346 L4 0 other
# 253941 cDNAs in all
# previous counts were:
# 96094 by Kohara, 1515 by Martin, 3184 by Stratagene,
# 388 Univ Ariz/U Wash, 113712 full cDNAs
# 1 male 141467 hermaphrodite 47123 mixed
# 33 adult 55646 embryo 57368 varied 70127 L1 5180 L2 3346 L4 0 other
# 214773 cDNAs in all
#
# indexfa is in kent/src/index/indexfa
#
time indexfa allcdna.fa allcdna.ix >> indexfa.out 2>&1
# real 0m15.549s
# user 0m5.660s
# sys 0m1.680s

#
# fixup names from WABA alignments from Ce2
# to access cluster filesystems and projects/compbio, on hgwdev:
ssh hgwdev
cd /projects/compbio/experiments/worm/idb.ws120
mkdir xeno
cd xeno
mkdir cbriggsae

# The lines from "#!/bin/sh" through the final "done" form a small sh
# script.  It concatenates every c*.wab file under ${THERE}/chr<N> into
# $RESULT, replacing the long cluster path name of each chromosome with
# the lower-case name chosen in the case statement.
#!/bin/sh
#
RESULT=/projects/compbio/experiments/worm/idb.ws120/xeno/cbriggsae/all.st
THERE=/cluster/data/ce2/bed/waba/out/
for i in I II III IV V X M
do
    IChr=i
    # Lower-case names, matching the nt4 file names (i ii iii iv v x m).
    case $i in
    I) IChr=i;;
    II) IChr=ii;;
    III) IChr=iii;;
    IV) IChr=iv;;
    V) IChr=v;;
    X) IChr=x;;
    M) IChr=m;;
    esac
    # Skip a chromosome whose directory is missing rather than reading
    # *.wab files from whatever directory we happen to be in.
    cd "${THERE}/chr${i}" || {
        echo "cannot cd to ${THERE}/chr${i}" >&2
        continue
    }
    for FN in c*.wab
    do
        # An unmatched glob stays literal; ignore it.
        [ -e "${FN}" ] || continue
        # The dots are escaped so they match only a literal "."
        sed -e \
        "s#/iscratch/i/worms/Celegans2/unmaskedFa/chr${i}\.fa\.chr${i}#${IChr}#" \
            "${FN}"
    done
done > "$RESULT"

# back on kks00, index that all.st
# file, and st files
# program ixxenost is in kent/src/index/ixxenost
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120
time ixxenost xeno/cbriggsae/all.st xeno/cbriggsae/all.ix
# real 4m29.919s
# user 3m50.790s
# sys 0m14.070s

# stToXao is in kent/src/index/stToXao
# this one reads the *.dir files to find things, expects to
# find those files in current working directory
time stToXao xeno/cbriggsae/all.st xeno/cbriggsae/
# real 4m37.433s
# user 3m51.040s
# sys 0m14.650s

cd /projects/compbio/experiments/worm/idb.ws120
mkdir ra ea

# prepare mRNA alignments cluster job
ssh eieio
mkdir /cluster/bluearc/ce2/nt4
mkdir /cluster/bluearc/ce2/exonAli
cp -p /projects/compbio/experiments/worm/idb.ws120/cDNA/allcdna.* \
    /cluster/bluearc/ce2/exonAli
cp -p /projects/compbio/experiments/worm/idb.ws120/nt4/*.nt4 \
    /cluster/bluearc/ce2/nt4

ssh kkr1u00
mkdir /iscratch/i/worms/Celegans2/nt4
cp -p /cluster/bluearc/ce2/nt4/*.nt4 /iscratch/i/worms/Celegans2/nt4
/cluster/bin/iSync

ssh kk
mkdir /cluster/data/ce2/bed/exonAli
cd /cluster/data/ce2/bed/exonAli
# The number 253943 below comes from a count of the number of
# records found in allcdna.fa
# faCount allcdna.fa > faCount.out
# wc faCount.out
# 253943 2031544 8308388 faCount.out
# NOTE(review): gb2cdi reported 253941 cDNAs; the two extra lines counted
# by wc are presumably non-sequence lines in faCount.out.  The difference
# does not change the number of 200-record batches (1270).
#
# One job per batch of 200 cDNAs; i is the index of the first record of
# the batch.  BEGIN is used so awk needs no input.
awk 'BEGIN {
    for (i = 0; i < 253943; i += 200) {
        printf "runOne {check out line+ results/%06d.out} %d 200\n", i, i
    }
}' > jobList

# runOne <check-file> <start> <count>
# Written with a quoted here-document so that $1, $2, $3 and the trailing
# backslashes reach the file unchanged.
cat << '_EOF_' > runOne
#!/bin/sh
/cluster/bin/i386/exonAli starting $1 \
    /cluster/bluearc/ce2/exonAli/allcdna.fa \
    /iscratch/i/worms/Celegans2/nt4 $2 $3 > out/$2.out
_EOF_
chmod +x runOne
# runOne redirects into out/ and the job list checks files in results/,
# so both directories have to exist before the jobs start.
# NOTE(review): presumably exonAli writes its result to the file named
# by $1 (results/NNNNNN.out) -- confirm against the exonAli usage.
mkdir -p out results
para create jobList
para try
para push
para time
# Completed: 1270 of 1270 jobs
# CPU time in finished jobs: 161830s 2697.16m 44.95h 1.87d 0.005 y
# IO & Wait Time: 336112s 5601.87m 93.36h 3.89d 0.011 y
# Average job time: 392s 6.53m 0.11h 0.00d
# Longest job: 625s 10.42m 0.17h 0.01d
# Submission to last job: 725s 12.08m 0.20h 0.01d

# Collect all batch results into the ea directory created above; this
# is the ea/all.out that refiAli reads below.
cat results/*.out > /projects/compbio/experiments/worm/idb.ws120/ea/all.out

# refine those alignments (refiAli source is in src/cdnaAli/refineAli)
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120
# this uses the ea/all.out which is from blasting the mRNA's
time refiAli ea/all.out cDNA/allcdna nt4 ra/good.txt \
    ra/bad.txt ra/cool.txt ra/err.txt 0 1000000 features/sanger/c2g \
    > ra/refiAli.out 2>&1
# real 100m22.510s
# user 96m34.360s
# sys 1m29.580s

# binGood is in kent/src/cdnaAli/binGood and it needs the object
# in kent/src/cdnaAli/lib/cdnaAli.o
binGood ra/good.txt cDNA/good.ali > cDNA/binGood.out 2>&1
# real 0m14.993s
# user 0m12.610s
# sys 0m1.010s

# ixali is in kent/src/index/ixali
ixali cDNA/good.ali cDNA/good.ix > cDNA/ixali.out 2>&1
# real 0m4.795s
# user 0m4.290s
# sys 0m0.410s

# ali2alx is in kent/src/cdnaAli/ali2alx
ali2alx cDNA/good.ali cDNA/ > cDNA/ali2alx.out 2>&1
# real 0m4.038s
# user 0m3.770s
# sys 0m0.190s

# cdnaOff is in kent/src/cdnaAli/cdnaOff
cdnaOff ra/good.txt cDNA/ > cDNA/cdnaOff.out 2>&1
# real 0m12.397s
# user 0m11.490s
# sys 0m0.570s

# XXXX
# NOTE(review): the marker above was a bare word in the notes (it would
# have been run as a command); presumably it marks how far the procedure
# had been carried out -- confirm.

mkdir html
mkdir obsolete
# The ra/good.txt is the input, all the other files on the command
# line are created by this command,
# also using: features/c2g.ix features/c2c.ix
time introns ra/good.txt cDNA/introns.gff cDNA/introns.txt \
    obsolete/altintron.txt html/altsplice.raw.html \
    cDNA/introns.fa > cDNA/introns.out 2>&1

# Fetch gene name reference list:
# NOTE(review): the URL names release WS122 although this build is
# WS120 -- confirm that this is intended.
cd /projects/compbio/experiments/worm/idb.ws120/sanger
wget --timestamping \
    ftp://ftp.wormbase.org/pub/wormbase/elegans/WS122/GENE_DUMPS/gene_names.txt
cd /projects/compbio/experiments/worm/idb.ws120
./features/gene_names2orf2gene.pl sanger/gene_names.txt > features/orf2gene.txt
makeOrf2gene features/orf2gene.txt sanger/orf2gene \
    sanger/syn > sanger/makeOrf2gene.out 2>&1
moresyn sanger/syn features/syn features/orf2gene \
    features/orfInfo > features/moresyn.out 2>&1

#
# Ready to publish
#
ssh hgwdev
# Copy the finished data set to /cluster/store4/worm/WS120.
# mkdir -p creates the parent directories too and does not fail when a
# directory is already there.
mkdir -p /cluster/store4/worm/WS120/nt4 \
    /cluster/store4/worm/WS120/cDNA \
    /cluster/store4/worm/WS120/features/sanger \
    /cluster/store4/worm/WS120/xeno/cbriggsae
cp -p /projects/compbio/experiments/worm/idb.ws120/nt4/*.nt4 \
    /cluster/store4/worm/WS120/nt4
cp -p /projects/compbio/experiments/worm/idb.ws120/xeno/cbriggsae/* \
    /cluster/store4/worm/WS120/xeno/cbriggsae
cp -p /projects/compbio/experiments/worm/idb.ws120/features/sanger/* \
    /cluster/store4/worm/WS120/features/sanger
# cp without -r skips the sanger subdirectory here (it reports it as
# omitted); its files were copied by the previous command.
cp -p /projects/compbio/experiments/worm/idb.ws120/features/* \
    /cluster/store4/worm/WS120/features
cp -p /projects/compbio/experiments/worm/idb.ws120/cDNA/* \
    /cluster/store4/worm/WS120/cDNA

# Link the published copy into /gbdb/intronWS120.
mkdir -p /gbdb/intronWS120/nt4 \
    /gbdb/intronWS120/cDNA \
    /gbdb/intronWS120/features \
    /gbdb/intronWS120/xeno
ln -s /cluster/store4/worm/WS120/nt4/* /gbdb/intronWS120/nt4
ln -s /cluster/store4/worm/WS120/cDNA/* /gbdb/intronWS120/cDNA
ln -s /cluster/store4/worm/WS120/features/* /gbdb/intronWS120/features
ln -s /cluster/store4/worm/WS120/xeno/* /gbdb/intronWS120/xeno

# *.dir files in the web directory, pointing at the /gbdb locations.
mkdir /usr/local/apache/htdocs/IntronWS120
cd /usr/local/apache/htdocs/IntronWS120
echo "/gbdb/intronWS120/cDNA/" > cdna.dir
echo "/gbdb/intronWS120/features/" > feat.dir
echo "/gbdb/intronWS120/nt4/" > nt4.dir
echo "/gbdb/intronWS120/xeno/" > xeno.dir