# for emacs: -*- mode: sh; -*- # This file describes how to make the browser database for the # worm C. briggsae # Currently 2003-05-29 this file is in a HIGH state of flux as it is being # worked out. # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +-----------+----------------------+ # | tableName | type | # +-----------+----------------------+ # | twinscan | genePred twinscanPep | # +-----------+----------------------+ DOWNLOAD SEQUENCE (DONE 2003-05-20 - Hiram) # next machine ssh eieio mkdir -p /cluster/store5/worm/cb1/sanger cd /cluster/store5/worm/cb1/sanger wget -o cb1.fetch.log -r -l1 --no-directories \ ftp://ftp.sanger.ac.uk/pub/wormbase/cbriggsae/cb25.agp8/ # Takes about eight minutes # These files seem to have not been updated since July 2002 # Check what you received: ls # cb1.fetch.log cb25.agp8.gff.tar.gz # cb25.agp8.agp.gz cb25.agp8.reads.placed.gz # cb25.agp8.contigs.fasta.gz cb25.agp8.supercontigs.fasta.gz # cb25.agp8.contigs.fasta.qual.gz cb25.agp8.supercontigs.fasta.qual.gz # cb25.agp8.fasta.gz README # get out of this download data directory, and create your home symlink # shortcut cd .. ln -s /cluster/store5/worm/cb1 ~/cb1 cd ~/cb1 # unzip the small contigs and ultra contigs # Will need both of those later zcat sanger/cb25.agp8.contigs.fasta.gz > contigs.fa zcat sanger/cb25.agp8.fasta.gz > ultra.fa # create an artifical chrUn agp file: (does 1000 N gaps) rm -f ultra.agp rm -f ultra.lft scaffoldFaToAgp ultra.fa # gap size is 1000, total gaps: 578 # chrom size is 109025926 # writing ultra.agp # writing ultra.lft mkdir Un mv ultra.agp Un/chrUn.agp mv ultra.lft Un/chrUn.lft #--------------- NEW GAP STUFF ------------------------------ # liftup sanger contig agp file to chrUn coordinates zcat sanger/cb25.agp8.agp.gz > Un/cb25.agp8.agp cd Un # NOTE: silly liftUp requires files to have same name as chrom # NOTE: the generated agp is no longer needed, save it anyway mv chrUn.agp chrUn.ultra.agp liftUp -gapsize=1000 chrUn.agp chrUn.lft warn cb25.agp8.agp mv chrUn.agp chrUn.contig.agp #hgGoldGapGl -noGl cb1 ~/cb1 Un #--------------- END NEW GAP STUFF ------------------------------ # hgGoldGapGl is an unusual command. It is given a directory # argument and a sub directory argument. It puts those two together # and then looks in that pathname for directory names of 1 or 2 # characters, and then looks in those subdirectories for agp files. # it all comes from the structure of the hg* directory structure # and how the human chromosomes are split into their contigs # SO, to handle hgGoldGapGl: mkdir Un/Un cd Un/Un ln -s ../*.agp . # Will put together the chrUn.fa later with the masked contigs # Populate /iscratch/i for masking runs # For some types of operations we want the smaller contigs # For others we want the ultracontigs. Since they are both # relatively small (about 100 M bases) we can have them both # exist out there # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Cbriggsae mkdir -p /iscratch/i/worms/Cbriggsae/contigs mkdir -p /iscratch/i/worms/Cbriggsae/ultras cd ~/cb1 cp -p contigs.fa /iscratch/i/worms/Cbriggsae cp -p ultra.fa /iscratch/i/worms/Cbriggsae faSplit sequence contigs.fa 1000 /iscratch/i/worms/Cbriggsae/contigs/c ~hiram/bin/i386/faSplit byname ultra.fa /iscratch/i/worms/Cbriggsae/ultras # I don't see Brian's faSplit byname function in the # /cluster/bin/i386/faSplit yet ? (It's version is 19 July 2002) ~kent/bin/iSync # With /iscratch/i/worms/Cbriggsae populated, ready for RepeatMasker run # next machine ssh kk cd ~/cb1 mkdir -p RMRun.ultras/out cd RMRun.ultras cat << '_EOF_' > RMWorm #!/bin/csh -fe # # $1 is full pathname to a contig .fa file # this directory . is the location to return results to set bname = `basename $1` /bin/mkdir -p /tmp/$bname /bin/cp $1 /tmp/$bname/ pushd . cd /tmp/$bname /scratch/hg/RepeatMasker/RepeatMasker -ali -s -el $bname popd /bin/cp /tmp/$bname/$bname.out ./out if (-e /tmp/$bname/$bname.align) /bin/cp /tmp/$bname/$bname.align ./out # /bin/cp /tmp/$2*.masked ../masked/ rm -f /tmp/$bname/$bname rm -f /tmp/$bname/$bname.* rmdir /tmp/$bname '_EOF_' chmod +x RMWorm cat << '_EOF_' > gsub #LOOP /cluster/store5/worm/cb1/RMRun.ultras/RMWorm {check in exists+ $(path1)} #ENDLOOP '_EOF_' ls /iscratch/i/worms/Cbriggsae/ultras/*.fa > briggsae.lst echo "" > dummy.lst gensub2 briggsae.lst dummy.lst gsub spec para create spec para try # SIMPLE REPEAT [TRF] TRACK (DONE 2003-05-29 - Hiram) # Assuming ultra contings are already in # /iscratch/i/worms/Cbriggsae/ultras from RepeatMasker setup above # Create cluster parasol job: # next machine ssh kk mkdir -p ~/cb1/bed/simpleRepeat cd ~/cb1/bed/simpleRepeat mkdir trf ls -1S /iscratch/i/worms/Cbriggsae/ultras/*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp #ENDLOOP '_EOF_' echo "" > dummy.lst gensub2 genome.lst dummy.lst gsub spec para create spec para try para check para push ... etc ... # PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-05-29 - Hiram) # next machine ssh eieio cd ~/cb1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # When cluster run is done, combine into one: #XXXX cat trf/*.bed > simpleRepeat.bed # Load into the database: # next machine #XXXX ssh hgwdev #XXXX cd ~/cb1/bed/simpleRepeat #XXXX /cluster/bin/i386/hgLoadBed cb1 simpleRepeat simpleRepeat.bed \ #XXXX -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Create Soft and Hard masks from RepeatMaster and TRF outputs: # and rebuild the nib files # next machine ssh eieio mkdir ~/cb1/ultras mkdir ~/cb1/ultraNibs cd ~/cb1/ultras ~hiram/bin/i386/faSplit byname ../ultra.fa . mkdir ~/cb1/softMask cd ~/cb1/softMask foreach c (../ultras/*.fa) set b = $c:t set r = $b:r echo ${c} ${b} ${r} /cluster/bin/i386/maskOutFa ${c} ../RMRun.ultras/out/${b}.out \ ${b} -soft /cluster/bin/i386/maskOutFa ${b} \ ../bed/simpleRepeat/trfMask/${r}.bed \ ${b} -softAdd faToNib -softMask ${b} ../ultraNibs/${r}.nib end # combine them all to be the query sequence for blastz runs cat *.fa > ../allUltras.soft.fa # With masked nib files ready, prepare cluster for blastz runs # next machine ssh kkr1u00 cd ~/cb1 mkdir -p /iscratch/i/worms/Cbriggsae/trfFa cp -p allUltras.soft.fa /iscratch/i/worms/Cbriggsae/trfFa mkdir -p /iscratch/i/worms/Cbriggsae/rmsk cd ~/cb1/RRun.ultras/out cp -p *.out /iscratch/i/worms/Cbriggsae/rmsk ~kent/bin/iSync # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX WORKING ON THIS 2003-06-10 XXXX # Un directory was created above from the scaffoldToAgp # next machine ssh eieio cd ~/cb1/Un /cluster/home/hiram/bin/i386/agpToFa -simpleMultiMixed chrUn.agp \ chrUn chrUn.fa ../allUltras.soft.fa # $ faSize chrUn.fa # 109025926 bases (3588679 N's 105437247 real) in 1 sequences in 1 files # create nib file: cd ~/cb1 mkdir nib /cluster/bin/i386/faToNib -softMask Un/chrUn.fa nib/chrUn.nib # Writing 109025926 bases in 54512971 bytes # CREATING DATABASE (DONE 2003-06-10 - Hiram) # Create the database. # next machine ssh hgwdev echo 'create database cb1' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING # !!! echo 'drop database cb1' | hgsql cb1 # Use df to make sure there is at least 5 gig free on df -h /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-06-10 - Hiram) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database rn1 to our new database cb1 echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql cb1 # if you need to delete that table: !!! WILL DELETE ALL grp data # !!! echo 'drop table grp;' | hgsql cb1 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 2003-05-20 - Hiram) # Make symbolic links from /gbdb/cb1/nib to the real nibs. # next machine # next machine ssh hgwdev mkdir -p /gbdb/cb1/nib ln -s /cluster/store5/worm/cb1/nib/chrUn.nib /gbdb/cb1/nib # Load /gbdb/cb1/nib paths into database and save size info. hgsql cb1 < ~/kent/src/hg/lib/chromInfo.sql # if you need to delete that table: !!! DELETES ALL DATA IN TABLE # !!! echo 'drop table chromInfo;' | hgsql cb1 cd ~/cb1 hgNibSeq -preMadeNib cb1 /gbdb/cb1/nib Un/chrUn.fa # Processing Un/chrUn.fa to /gbdb/cb1/nib/chrUn.nib # 109025926 total bases # Verify the hgNibSeq load functioned OK: echo "select chrom,size from chromInfo" | hgsql -N cb1 > chrom.sizes cat chrom.sizes # Typical contents of chrom.sizes: # chrUn 109025926 # Set up relational mrna tables. hgLoadRna new cb1 # that created a bunch of tables. If you need to delete them: echo 'drop table author; \ drop table cds; drop table cell; drop table description; \ drop table development; drop table extFile; drop table geneName; \ drop table history; drop table keyword; drop table library; drop table mrna; \ drop table mrnaClone; drop table organism; drop table productName; \ drop table seq; drop table sex; drop table source; drop table tissue;' \ | hgsql cb1 # OR - hgLoadRna drop cb1 # MAKE GCPERCENT (DONE 2003-06-10 - Hiram) # next machine ssh hgwdev mkdir -p /cluster/store5/worm/cb1/bed/gcPercent cd /cluster/store5/worm/cb1/bed/gcPercent hgsql cb1 < ~/kent/src/hg/lib/gcPercent.sql # If you need to delete that table created echo 'drop table gcPercent;' | hgsql cb1; hgGcPercent cb1 ../../nib hgGoldGapGl -noGl cb1 /cluster/store5/worm/cb1 Un # hgGoldGapGl looked in /cluster/store5/worm/cb1/Un for a 1 or 2 # character directory name, then below that found the agp file ! # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CB1 (DONE 2002-06-10 - Hiram) # next machine ssh hgwdev echo 'insert into defaultDb values("C. briggsae", "cb1");' \ | hgsql -h genome-testdb hgcentraltest # If you need to delete that entry: echo 'delete from defaultDb where name="cb1";' \ | hgsql -h genome-testdb hgcentraltest # Note: for next assembly, set scientificName column to # "Caenorhabditis briggsae" echo 'insert into dbDb values("cb1", "July 2002", \ "/gbdb/cb1/nib", "Worm", "chrUn:74980670-74998831", 1, 10, \ "C. briggsae");' \ | hgsql -h genome-testdb hgcentraltest # If you need to delete that entry: echo 'delete from dbDb where name="cb1";' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add cb1 in all the right places and do make update # before you make alpha you must commit your trackDb/makefile # and any other trackDb/worm files cvs commit makefile make alpha # CREATE REPEAT TRACKS (2003-06-10 - DONE - Hiram) ssh eieio cd ~/cb1 # merge and lift up the repeatmasker output files to chrom # coordinates liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun.ultras/out/*.out # load into the database (chrUn_rmsk table) ssh hgwdev hgLoadOut cb1 Un/chrUn.fa.out # lift the simple repeat output to chrom coordinates ssh eieio cd ~/cb1 cd bed/simpleRepeat liftUp simpleRepeat.bed ~/cb1/Un/chrUn.lft warn trf/*.bed # load into the database (simpleRepeat table) ssh hgwdev cd ~/cb1/bed/simpleRepeat hgLoadBed cb1 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql # Loaded 32763 elements # MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 2003-06-12 - Hiram) # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Cbriggsae/mRNA mkdir -p /iscratch/i/worms/Cbriggsae/EST cd /iscratch/i/worms/Cbriggsae/mRNA # there are 33 sequences in mrna.fa, break up byname, each one to a file # (the 100 argument is necessary, but ignored) faSplit byname \ /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.fa 100 cd /iscratch/i/worms/Cbriggsae/EST # there are 2424 ESTs. (The 100 argument is necessary but ignored) faSplit byname \ /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.fa 100 ~kent/bin/iSync # next machine, small cluster is fine, there are few tiny jobs ssh kkr1u00 cd ~/cb1/bed mkdir mrna est cd mrna mkdir psl ls -1S /iscratch/i/worms/Cbriggsae/bothMasksNib/chr*.nib > worm.lst ls -1S /iscratch/i/worms/Cbriggsae/mRNA/*.fa > mrna.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst mrna.lst gsub spec para create spec para try para check para push # there are only 33 jobs ... etc ... # Average job time: 22s 0.36m 0.01h 0.00d # Longest job: 25s 0.42m 0.01h 0.00d # cluster run done pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \ /dev/null pslSortAcc nohead chrom /tmp all_mrna.psl # load mrna tables # next machine ssh hgwdev cd ~/cb1/bed/mrna/chrom # names must be chr*_mrna.psl, there is only one mv chrUn.psl chrUn_mrna.psl # this load command rewrites these table entirely if they exist hgLoadPsl -noTNameIx cb1 *.psl mkdir /gbdb/cb1/mrna.134 ln -s /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.fa \ /gbdb/cb1/mrna.134 # ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files hgLoadRna add -type=mRNA cb1 /gbdb/cb1/mrna.134/mrna.fa \ /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/mrna.ra cd ~/cb1/bed/mrna hgLoadPsl cb1 all_mrna.psl # /iscratch/i was setup with est.fa file above # Let's try the bluearc server ssh kk cd ~/cb1/bed/est mkdir psl ls -1S /cluster/bluearc/iscratch/i/worms/Cbriggsae/bothMasksNib/chr*.nib > worm.lst # names are too long for an ls all the way over in /cluster/bluearc ln -s /cluster/bluearc/iscratch/i/worms/Cbriggsae/EST cbaEST ls -1S cbaEST/* > est.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst est.lst gsub spec para create spec para try para check para push # Average job time: 80s 1.33m 0.02h 0.00d # Longest job: 537s 8.95m 0.15h 0.01d # I had trouble getting the cluster to finish these jobs # They would appear to be running, but when I checked on them, they # were not accumulating any CPU time. Had to stop and re-push a # number of times to get them all to complete # cluster run done # next machine ssh hgwdev cd ~/cb1/bed/est pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \ /dev/null pslSortAcc nohead chrom /tmp all_est.psl cd chrom # name must be chr*_est.psl mv chrUn.psl chrUn_est.psl # this load command rewrites these table entirely if they exist hgLoadPsl -noTNameIx cb1 *.psl ln -s /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.fa \ /gbdb/cb1/mrna.134 # ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files hgLoadRna add -mrnaType=EST -type=EST cb1 /gbdb/cb1/mrna.134/est.fa \ /cluster/store5/mrna.134/org/Caenorhabditis_briggsae/est.ra cd ~/cb1/bed/est hgLoadPsl cb1 all_est.psl -nobin # END ==== MAKING AND STORING mRNA AND EST ALIGNMENTS # BEGINNING OF Ce1 BLASTZ (DONE 2003-6-17 - Hiram) # next machine ssh kk mkdir -p ~/cb1/bed/blastzCe1 cd ~/cb1/bed/blastzCe1 cp ~angie/hummus/DEF.cb1-ce1.2003-06-17 DEF # source the DEF file which was carefully written as above. . ./DEF mkdir -p $BASE/run ~angie/hummus/make-joblist $DEF > $BASE/run/j # that created xdir.sh and joblist run/j sh $BASE/xdir.sh # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size cd $BASE/run # now edit j because it is not correct: sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2 wc -l j* head j2 # *** make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j para try para check para push ... etc ... Average job time: 1485s 24.75m 0.41h 0.02d Longest job: 3821s 63.68m 1.06h 0.04d # When that cluster run is done, results are in $BASE/raw/chr*/* # continuing with ~angie/hummus/Notes: # --- normalize and single_cov cd ~/cb1/bed/blastzCe1 # source the DEF file again in case you are coming back to this . ./DEF # a new run directory mkdir -p $BASE/run.1 # another obscure script creates a new job list: ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j cd $BASE/run.1 # the job list is once again incorrect, edit it: sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2 wc -l j* head j2 # make sure the edited j2 is OK, then use it: mv j2 j para create j para try para push;para check; ... etc ... # Average job time: 412s 6.86m 0.11h 0.00d # Longest job: 585s 9.75m 0.16h 0.01d # Translate the .lav files created by the end of ~angie/hummus Notes # into axt files ssh eieio set base="/cluster/store5/worm/cb1/bed/blastzCe1" set seq1_dir="/cluster/store5/worm/cb1/nib" set seq2_dir="/cluster/store5/worm/ce1/nib" set tbl="blastzCe1" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | /cluster/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \ | /cluster/bin/i386/axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r nice /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # load these blastz results # next machine ssh hgwdev cd ~/cb1/bed/blastzCe1/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx cb1 chr*_*.psl # trackDb/worm/cb1/trackDb.ra entry: # track blastzCe1 # shortLabel briggsae Blastz # longLabel Blastz C briggsae # group compGeno # priority 159 # visibility dense # color 0,0,0 # altColor 50,128,50 # spectrum on # type psl xeno cb1 # CHAINING Briggsae blastz # next machine, small cluster is good for these tiny jobs ssh kkr1u00 cd /cluster/store5/worm/cb1/bed/blastzCe1 mkdir axtChain cd axtChain mkdir run1 cd run1 ls -1S ../../axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /cluster/store5/worm/cb1/nib /cluster/store5/worm/ce1/nib $2 > $3 '_EOF_' chmod a+x doChain mkdir out chain gensub2 input.lst single gsub spec para create spec para try Average job time: 3175s 52.92m 0.88h 0.04d Longest job: 3175s 52.92m 0.88h 0.04d # now on the cluster server, sort chains # next machine ssh eieio cd ~/cb1/bed/blastzCe1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd ~/cb1/bed/blastzCe1/axtChain/chain hgLoadChain cb1 chrUn_ce1Chain chrUn.chain # END Blastz C Elegans # 2004-05-07 # changed priorities and names for Ce1 blastz and chain tracks # trackDb/worm/cb1/trackDb.ra entry: # track blastzCe1 # shortLabel Ce1 Blastz # longLabel Blastz C. elegans (Ce1) # group compGeno # priority 156.2 # visibility dense # color 0,0,0 # altColor 50,128,50 # spectrum on # type psl xeno ce1 # otherDb ce1 # track ce1Chain # shortLabel Ce1 Chain # longLabel Chained elegans(Ce1)/briggsae Alignments # group compGeno # priority 156.1 # visibility dense # color 100,50,0 # altColor 255,240,200 # spectrum on # type chain ce1 # otherDb ce1 # C. elegans (Ce2) Blastz (DONE, 2004-05-06, hartera) # next machine ssh kkr1u00 # blastz requires lineage-specific repeats but there are none for the worms # so create empty files for each chromsome and iSync mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans # create empty chrUn.out.spec file if there is not one already here mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae # create empty chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV, chrV, chrX, chrM if not there alreday iSync ssh kk mkdir -p /cluster/data/cb1/bed/blastzCe2.2004-05-05 cd /cluster/data/cb1/bed/ ln -s blastzCe2.2004-05-05 blastzCe2 cd blastzCe2 cat << '_EOF_' > DEF # C. briggsae vs. C. elegans export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 #BLASTZ_ABRIDGE_REPEATS=1 # when SMSK=/dev/null BLASTZ_ABRIDGE_REPEATS=0 # TARGET SEQ1_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib # RMSK not currently used SEQ1_RMSK=/iscratch/i/worms/Cbriggsae/rmsk SEQ1_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans # FLAG not currently used SEQ1_FLAG=-worm SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY SEQ2_DIR=/iscratch/i/worms/Celegans2/nib # RMSK not currently used SEQ2_RMSK=/iscratch/i/worms/Celegans2/rmsk SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae # FLAG not currently used SEQ2_FLAG=-worm SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/store5/worm/cb1/bed/blastzCe2 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len #DEBUG=1 '_EOF_' # << this line keeps emacs coloring happy # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.cb1-ce2.2004-05-05 mkdir /cluster/data/cb1/jkStuff # Need shell scripts from ce2 to do cluster runs mv /cluster/data/ce2/jkStuff/BlastZ*.sh /cluster/data/cb1/jkStuff/ # prepare first cluster run ssh kk cd /cluster/data/cb1/bed/blastzCe2 source DEF /cluster/data/cb1/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # para time # Completed: 154 of 154 jobs # CPU time in finished jobs: 181038s 3017.31m 50.29h 2.10d 0.006 y # IO & Wait Time: 2433s 40.54m 0.68h 0.03d 0.000 y # Average job time: 1191s 19.86m 0.33h 0.01d # Longest job: 3365s 56.08m 0.93h 0.04d # Submission to last job: 5981s 99.68m 1.66h 0.07d # Second cluster run to convert the .out's to .lav's cd /cluster/data/cb1/bed/blastzCe2 source DEF /cluster/data/cb1/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 11 of 11 jobs # CPU time in finished jobs: 973s 16.21m 0.27h 0.01d 0.000 y # IO & Wait Time: 57s 0.96m 0.02h 0.00d 0.000 y # Average job time: 94s 1.56m 0.03h 0.00d # Longest job: 125s 2.08m 0.03h 0.00d # Submission to last job: 125s 2.08m 0.03h 0.00d # Third cluster run to convert lav's to axt's cd /cluster/data/cb1/bed/blastzCe2 source DEF /cluster/data/cb1/jkStuff/BlastZ_run2.sh cd run.2 para try para check # para time # CPU time in finished jobs: 240s 4.00m 0.07h 0.00d 0.000 y # IO & Wait Time: 359s 5.99m 0.10h 0.00d 0.000 y # Average job time: 599s 9.98m 0.17h 0.01d # Longest job: 599s 9.98m 0.17h 0.01d # Submission to last job: 599s 9.98m 0.17h 0.01d # translate sorted axt files into psl ssh eieio cd /cluster/data/cb1/bed/blastzCe2 mkdir -p pslChrom set tbl = "blastzCe2" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/cb1/bed/blastzCe2/pslChrom /cluster/bin/i386/hgLoadPsl -noTNameIx cb1 chrUn_blastzCe2.psl # alignments have gone from 1668769 for Ce1 to 1256139 for Ce2 # trackDb/worm/cb1/trackDb.ra entry: # track blastzCe2 # shortLabel Elegans Blastz # longLabel Blastz C. elegans # group compGeno # priority 155.2 # visibility dense # color 0,0,0 # altColor 50,128,50 # spectrum on # type psl xeno ce2 # otherDb ce2 # CHAINING C. elegans blastz for Ce2 [DONE, 2004-05-06, hartera] # next machine ssh kk mkdir -p /cluster/data/cb1/bed/blastzCe2/axtChain/run1 cd /cluster/data/cb1/bed/blastzCe2/axtChain/run1 mkdir out chain ls -1S /cluster/data/cb1/bed/blastzCe2/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter $1 | axtChain stdin \ /iscratch/i/worms/Cbriggsae/bothMasksNib \ /iscratch/i/worms/Celegans2/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList # only 1 job so all done by para try para try para check # para time # CPU time in finished jobs: 1649s 27.48m 0.46h 0.02d 0.000 y # IO & Wait Time: 15s 0.26m 0.00h 0.00d 0.000 y # Average job time: 1664s 27.73m 0.46h 0.02d # Longest job: 1664s 27.73m 0.46h 0.02d # Submission to last job: 1664s 27.73m 0.46h 0.02d # now on the file server, sort chains ssh eieio cd /cluster/data/cb1/bed/blastzCe2/axtChain time chainMergeSort run1/chain/*.chain > all.chain # User 48.840u # System 3.530s # Elapsed Real 0:54.94 time chainSplit chain all.chain # User 49.300u # System 3.700s # Elapsed Real 0:55.55 # Load chains into database # next machine ssh hgwdev cd /cluster/data/cb1/bed/blastzCe2/axtChain/chain hgLoadChain cb1 chrUn_chainCe2 chrUn.chain # trackDb/worm/cb1/trackDb.ra entry: # track chainCe2 # shortLabel Elegans Chain # longLabel Chained elegans/briggsae Alignments # group compGeno # priority 155.1 # visibility dense # color 100,50,0 # altColor 255,240,200 # spectrum on # type chain ce2 # ADD TWINSCAN PREDICTIONS (DONE, 2004-11-30, hartera) # Provided by Chaochun Wei: wei@cse.wustl.edu # These are gene predictions produced by Iscan (new version of Twinscan) # C. briggsae/C. elegans (version WS98) homology used for predictions. ssh eieio mkdir -p /cluster/data/cb1/bed/twinscan cd /cluster/data/cb1/bed/twinscan wget --timestamp \ http://genes.cs.wustl.edu/predictions/worm/C_briggsae/cb25.agp8_11_19_2004/cb25.agp8_11_19_2004.tgz tar xvzf cb25.agp8_11_19_2004.tgz # Add '.a' to end of protein fasta id's, to match gtf transcript_id's: cp /dev/null twinscanPep.fa perl -wpe 's/^(>\S+).*/$1.a/' chr_ptx/*.ptx > twinscanPep.fa # need to do lift up from contigs to chrUn cat ./chr_gtf/cb*.gtf >> cbAll.gtf liftUp cbAll.lifted.gtf /cluster/data/cb1/Un/chrUn.lft warn cbAll.gtf # Got 1156 lifts in /cluster/data/cb1/Un/chrUn.lft # load. ssh hgwdev cd /cluster/data/cb1/bed/twinscan ldHgGene -gtf -genePredExt cb1 twinscan cbAll.lifted.gtf # Read 25001 transcripts in 175327 lines in 1 files # 25001 groups 1 seqs 1 sources 3 feature types # 25001 gene predictions hgPepPred cb1 generic twinscanPep twinscanPep.fa # Add track to trackDb.ra for cb1 ####################################################################### # TWINSCAN CLEANUP (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/store5/worm/cb1/bed/twinscan # remove cbAll.gtf as same as in chr_gtf directory rm cbAll.gtf rm *.tab # gzip other files gzip chr_gtf/*.gtf chr_ptx/*.ptx chr_tx/*.tx ####################################################################### # Ce2 BLASTZ CLEANUP (DONE, 2007-06-25, hartera) ssh kkstore02 cd /cluster/store5/worm/cb1/bed/blastzCe2.2004-05-05/pslChrom rm psl.tab gzip chrUn_blastzCe2.psl cd /cluster/store5/worm/cb1/bed/blastzCe2/lav/chrUn gzip *.lav cd /cluster/store5/worm/cb1/bed/blastzCe2/axtChain gzip all.chain # this is the same as the chrUn.chain since there is only one chrom # so the chain directory can be removed. rm -r chain cd run1/chain/ gzip chrUn.chain cd .. rm -r err rm batch batch.bak para.results cd /cluster/store5/worm/cb1/bed/blastzCe2/axtChrom gzip chrUn.axt ####################################################################### # redo genbank alignments, as update was never enabled. (2007-11-15 markd) cd kent/src/hg/makeDb/genbank # edit etc/genbank.conf and add cb1.refseq.mrna.xeno.load = yes cvs commit etc make etc-update # do alignment ssh genbank cd /cluster/data/genbank ((nice ./bin/gbAlignStep -initial cb1)|&mail markd&) # the next day.. ssh hgwdev cd /cluster/data/genbank ((nice ./bin/gbDbLoadStep -drop -initialLoad cb1)|&mail markd&)