# for emacs: -*- mode: sh; -*- # This file describes how to make the browser database for the # worm C. elegans ########################################################################### # DOWNLOAD SEQUENCE (DONE, 2005-04-29, hiram) ssh eieio mkdir /cluster/store5/worm/ce3 cd /cluster/store5/worm/ce3 mkdir WS140 cd WS140 for F in composition.all masked totals do FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/" wget "${FTP}/${F}" -O ${F} done # while fetching chrM, rename it from CHROMOSOME_MtDNA to # CHROMOSOME_M to match the pattern of the others FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/" wget "${FTP}/CHROMOSOME_MtDNA.dna.gz" -O /dev/stdout | zcat | \ sed -e "s/MtDNA/M/" | gzip > chrM.fa.gz wget "${FTP}/CHROMOSOME_MtDNA.gff.gz" -O /dev/stdout | zcat | \ sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" | gzip > chrM.gff.gz # create agp file for chrM echo "M 1 13794 1 F X54252.1 1 13794 +" | sed -e "s/ */\t/g" > \ chrM.agp for C in I II III IV V X do FTP="ftp://ftp.wormbase.org/pub/wormbase/elegans/WS140/CHROMOSOMES/" wget --timestamping "${FTP}/CHROMOSOME_${C}.dna.gz" -O chr${C}.fa.gz wget --timestamping "${FTP}/CHROMOSOME_${C}.gff.gz" -O chr${C}.gff.gz wget --timestamping "${FTP}/CHROMOSOME_${C}.agp" -O chr${C}.agp echo "done: chr${C}" done faCount *.fa.gz # #seq len A C G T N cpg # CHROMOSOME_I 15080552 4838583 2697189 2693560 4851220 0 503702 # CHROMOSOME_II 15279311 4878196 2769211 2762192 4869712 0 492145 # CHROMOSOME_III 13783317 4444530 2449080 2466259 4423448 0 459652 # CHROMOSOME_IV 17493785 5711040 3034769 3017007 5730969 0 522372 # CHROMOSOME_V 20922231 6749806 3711722 3700960 6759743 0 638852 # CHROMOSOME_X 17718850 5746417 3119281 3118287 5734865 0 514715 # chrM 13794 4335 1225 2055 6179 0 110 # total 100291840 32372907 17782477 17760320 323761360 3131548 cd /cluster/store5/worm/ce3 # translate to unzipped .fa, all upper case, and # rename the agp files so hgGoldGap can find them mkdir sangerFa for C in I II III IV V X M do echo -n "${C} " zcat WS140/chr${C}.fa.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${C}.fa mkdir -p sangerFa/${C} ln -s ${C} sangerFa/chr${C} sed -e "s/^/chr/" `pwd`/WS140/chr${C}.agp > sangerFa/${C}/chr${C}.agp done # ln -s `pwd`/WS140/chr${C}.agp sangerFa/${C}/chr${C}.agp # verify faCount still has the same amount of sequence as listed above cd sangerFa faCount *.fa # you should see the same numbers # the assembly has no N's: grep N *.fa # shows nothing # set cluster data symlink for future reference: ssh hgwdev ln -s /cluster/store5/worm/ce3 /cluster/data/ce3 ########################################################################### # PREPARE Split contigs into 100,000 bp chunks for cluster runs # (DONE, 2004-04-29, Hiram) # next machine ssh eieio cd /cluster/data/ce3 rm -fr ./split mkdir split for C in I II III IV V X M do mkdir split/${C} faSplit size sangerFa/chr${C}.fa 100000 split/${C}/c -lift=split/chr${C}.lft done 151 pieces of 151 written 153 pieces of 153 written 138 pieces of 138 written 175 pieces of 175 written 210 pieces of 210 written 178 pieces of 178 written 1 pieces of 1 written cat split/*.lft > liftAll.lft # copy them to /iscratch/i for cluster rsync # next machine ssh kkr1u00 cd /cluster/data/ce3/split for C in I II III IV V X M do echo -n "${C} " mkdir -p /iscratch/i/worms/Celegans3/unmaskedSplit/${C} cp -p ${C}/c*.fa /iscratch/i/worms/Celegans3/unmaskedSplit/${C} done # this iSync takes forever these days because iscratch is so large /cluster/bin/iSync # instead, a simple rsync of just this business will get it done # rapidly for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \ kkr${i}u00:/iscratch/i/worms/ done ############################################################################ # Run RepeatMasker on the chromosomes (DONE, 2005-04-29 - Hiram) # next machine ssh kk cd /cluster/data/ce3 mkdir jkStuff # make run directory and job list, create the script to use # for the RepeatMasker run cat << '_EOF_' > jkStuff/RMWorm #!/bin/csh -fe # # This is a slight rearrangement of the # RMChicken script used in makeGalGal2.doc # The results here need to go to a different location # $1 == chrom name: I II III IV V X M # $2 == directory where split contig .fa is found # $3 == name of contig .fa file cd $1 pushd . cd $2 /bin/mkdir -p /tmp/ce3/$3/$1 /bin/cp $3 /tmp/ce3/$3/$1 cd /tmp/ce3/$3/$1 /cluster/bluearc/RepeatMasker050112/RepeatMasker -alignments -s -species elegans $3 popd /bin/cp /tmp/ce3/$3/$1/$3.out ./ if( -e /tmp/ce3/$3/$1/$3.align ) /bin/cp /tmp/ce3/$3/$1/$3.align ./ if (-e /tmp/ce3/$3/$1/$3.tbl) /bin/cp /tmp/ce3/$3/$1/$3.tbl ./ if (-e /tmp/ce3/$3/$1/$3.cat) /bin/cp /tmp/ce3/$3/$1/$3.cat ./ /bin/rm -r /tmp/ce3/$3/$1 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce3/$3 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce3 '_EOF_' # emacs happy chmod +x jkStuff/RMWorm # create job list mkdir RMRun rm -f RMRun/jobList for C in I II III IV V X M do mkdir /cluster/data/ce3/RMRun/${C} for T in /iscratch/i/worms/Celegans3/unmaskedSplit/$C/c*.fa do D=`dirname $T` F=`basename $T` echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \ '{'check out line+ /cluster/data/ce3/RMRun/$C/${F}.out'}' done >> RMRun/jobList done # Do the run cd /cluster/data/ce3/RMRun para create jobList para try, para check, para check, para push, para check, ... XXXX # STARTED 2005-04-29 14:28 # Completed: 1006 of 1006 jobs # CPU time in finished jobs: 1129254s 18820.90m 313.68h 13.07d 0.036 y # IO & Wait Time: 4664s 77.73m 1.30h 0.05d 0.000 y # Average job time: 1127s 18.79m 0.31h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1290s 21.50m 0.36h 0.01d # Submission to last job: 3575s 59.58m 0.99h 0.04d # when they are finished, liftUp and load the .out files into the database: # next machine ssh eieio cd /cluster/data/ce3/RMRun for C in I II III IV V X M do liftUp chr${C}.fa.out /cluster/data/ce3/split/chr${C}.lft warn ${C}/*.fa.out done ########################################################################## # CREATE DATABASE ssh eieio cd /cluster/data/ce3 faToTwoBit sangerFa/chr*.fa ce3.2bit twoBitInfo ce3.2bit stdout | sort -rn +1 > chrom.sizes grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst twoBitInfo ce3.2bit stdout | awk '{printf "%s\t%s\t/gbdb/ce3/ce3.2bit\n", $1,$2}' > chromInfo.tab ssh hgwdev cd /cluster/data/ce3 hgsql -e "create database ce3;" mysql # Make sure we have enough room (eventually ~ 70Gb) for mysql tables: df -h | grep mysql # /dev/sda1 472G 230G 218G 52% /var/lib/mysql2 # /dev/sdc1 1.8T 904G 756G 55% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2005-03-10 - Hiram) # Use any of the newest databases to ensure that the organization # of the grp table is up to date ssh hgwdev cd /cluster/data/ce3 hgsql ce3 -e "create table grp (PRIMARY KEY(NAME)) select * from mm6.grp" hgsql ce3 < $HOME/kent/src/hg/lib/chromInfo.sql hgsql ce3 -e 'load data local infile "chromInfo.tab" into table chromInfo;' # Enter ce3 into dbDb and defaultDb so test browser knows about it: hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk, hgPbOk, sourceName) \ VALUES("ce3", "March 2005", "/gbdb/ce3", "C. elegans", \ "chrII:14647078-14667002", 1, 59, "C. elegans", \ "Caenorhabditis elegans", "/gbdb/ce3/html/description.html", 0, 0, \ "WormBase v. WS140");' -h localhost hgcentraltest # do this defaultDb entry later after there is something to see # on this browser. hgsql -e 'INSERT INTO defaultDb (name, genome) VALUES("ce3", "Mouse")' \ -h localhost hgcentraltest # start a new entry in the trackDb hierarchy ssh hgwdev # start a new entry in the trackDb hierarchy cd $HOME/kent/src/hg/makeDb/trackDb/worm mkdir ce3 cvs add ce3 cd ce3 cp ../ce3/description.html . vi description.html - fixup text for this assembly cvs add description.html cvs commit cd ../.. vi makefile - add ce3 to the list mkdir /cluster/data/ce3/html mkdir /gbdb/ce3 ln -s /cluster/data/ce3/html /gbdb/ce3/html ln -s /cluster/data/ce3/ce3.2bit /gbdb/ce3/ce3.2bit cp -p worm/ce3/description.html /gbdb/ce3/html make DBS=ce3 ZOO_DBS="" cd /cluster/data/ce3/RMRun hgLoadOut ce3 chr*.fa.out # bad rep range [1023, 233] line 2579 of chrII.fa.out # bad rep range [2914, 2908] line 2791 of chrIII.fa.out # This result produces: 13238621 bases, 101,554 items, %13.200098 ####################################################################### # an experiment to see if more repeat masked elements can be found # do a second repeat masker run, with a different split to help # cover some of the edges that the previous split may have made # that cause RM to not find items next to those edges. # # The result of this experiment is that splitting at the 100,000 # boundary hardly made much of a difference compared to a # completely unsplit result. So, we will go with the traditional # split at 100,000 and leave it at that. ####################################################################### ########################################################################### # PREPARE Split contigs into 103,000 bp chunks for a second RM run # This isn't perfect, there are a couple of breaks that are # exactly the same as before (seven) but all others are at least # 1000 different # (DONE, 2004-04-29, Hiram) # next machine ssh eieio cd /cluster/data/ce3 rm -fr ./split103 mkdir split103 for C in I II III IV V X M do mkdir split103/${C} faSplit size sangerFa/chr${C}.fa 103000 split103/${C}/c -lift=split103/chr${C}.lft done 147 pieces of 147 written 149 pieces of 149 written 134 pieces of 134 written 170 pieces of 170 written 204 pieces of 204 written 173 pieces of 173 written 1 pieces of 1 written cat split103/*.lft > liftAll103.lft # copy them to /iscratch/i for cluster rsync # next machine ssh kkr1u00 cd /cluster/data/ce3/split103 for C in I II III IV V X M do echo -n "${C} " mkdir -p /iscratch/i/worms/Celegans3/split103/${C} cp -p ${C}/c*.fa /iscratch/i/worms/Celegans3/split103/${C} done # this iSync takes forever these days because iscratch is so large /cluster/bin/iSync # instead, a simple rsync of just this business will get it done # rapidly for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \ kkr${i}u00:/iscratch/i/worms/ done for C in I II III IV V X M do mkdir /cluster/data/ce3/RMFull/${C} for T in /iscratch/i/worms/Celegans3/unmaskedFa/chr${C}.fa do D=`dirname $T` F=`basename $T` echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \ '{'check out line+ /cluster/data/ce3/RMFull/$C/${F}.out'}' done >> RMFull/jobList done # Do the run ############################################################################ # Second RepeatMasker on different split size # next machine ssh kk cd /cluster/data/ce3 # create job list mkdir RMRun103 rm -f RMRun103/jobList for C in I II III IV V X M do mkdir /cluster/data/ce3/RMRun103/${C} for T in /iscratch/i/worms/Celegans3/split103/$C/c*.fa do D=`dirname $T` F=`basename $T` echo /cluster/data/ce3/jkStuff/RMWorm ${C} ${D} ${F} \ '{'check out line+ /cluster/data/ce3/RMRun103/$C/${F}.out'}' done >> RMRun103/jobList done # Do the run cd /cluster/data/ce3/RMRun103 para create jobList para try, para check, para check, para push, para check, ... # STARTED 2005-04-29 15:43 # Completed: 978 of 978 jobs # CPU time in finished jobs: 1129788s 18829.81m 313.83h 13.08d 0.036 y # IO & Wait Time: 5530s 92.16m 1.54h 0.06d 0.000 y # Average job time: 1161s 19.35m 0.32h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 1313s 21.88m 0.36h 0.02d # Submission to last job: 2737s 45.62m 0.76h 0.03d # when they are finished, liftUp and load the .out files into the database: # next machine ssh eieio cd /cluster/data/ce3/RMRun103 for C in I II III IV V X M do liftUp chr${C}.fa.out /cluster/data/ce3/split103/chr${C}.lft warn ${C}/*.fa.out done # next machine ssh hgwdev cd /cluster/data/ce3/RMRun103 hgLoadOut ce3 chr*.fa.out bad rep range [1023, 233] line 2595 of chrII.fa.out bad rep range [2914, 2908] line 2797 of chrIII.fa.out # This result produces: 13236863 bases, 101,529 items, %13.198345 ####################################################################### # The same RM run was done a third time on whole un-split chrom # files. This run took 35 hours on the Iservers kluster. # Loading these full chrom results: # Not split: 13,238,684 bases, 101,565 items, %13.200161 # Split at 100,000 bases: 13,238,621 bases, 101,554 items, %13.200098 # Split at 103,000 bases: 13,236,863 bases, 101,529 items, %13.198345 # Even the full chroms had problems with coordinates: # bad rep range [1023, 233] line 2586 of chrII.fa.out # bad rep range [2914, 2908] line 2792 of chrIII.fa.out # Combining the two split results: # 13,283,266 bases, 103,117 items, %13.244613 ####################################################################### # Most interestingly, the hgLoadOut would *not* load the files # unless they had been lifted. It said they were not valid .out # files ! So, a fake split at 30,000,000 to create some lft # files: (30,000,000 is too large to cause anything to split) for C in I II III IV V X M do mkdir splitFull/${C} faSplit size sangerFa/chr${C}.fa 30000000 splitFull/${C}/c -lift=splitFull/chr${C}.lft done # ####################################################################### # SIMPLE REPEAT [TRF] TRACK (DONE - 2005-05-02 - Hiram) ssh kki mkdir -p /cluster/data/ce3/bed/simpleRepeat cd /cluster/data/ce3/bed/simpleRepeat mkdir trf ls -1S /iscratch/i/worms/Celegans3/unmaskedFa/chr*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp #ENDLOOP '_EOF_' # happy emacs gensub2 genome.lst single gsub jobList para create jobList para try # there are only 7, so this runs them all # Completed: 7 of 7 jobs # CPU time in finished jobs: 1751s 29.18m 0.49h 0.02d 0.000 y # IO & Wait Time: 28s 0.47m 0.01h 0.00d 0.000 y # Average job time: 254s 4.24m 0.07h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 545s 9.08m 0.15h 0.01d # Submission to last job: 545s 9.08m 0.15h 0.01d # When cluster run is done, combine into one: cat trf/*.bed > simpleRepeat.bed # Load into the database: # next machine ssh hgwdev cd /cluster/data/ce3/bed/simpleRepeat hgLoadBed ce3 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 28637 elements of size 16 ####################################################################### # PROCESS SIMPLE REPEATS AND RMSK INTO MASK (DONE, 2005-05-02 - Hiram) # After the simpleRepeats track has been built, make a filtered version # of the trf output: keep trf's with period <= 12: ssh eieio cd /cluster/data/ce3/bed/simpleRepeat mkdir -p trfMask for F in trf/*.bed do T=${F#trf/} echo "${F} > trfMask/${T}" awk '{if ($5 <= 12) print;}' ${F} > trfMask/${T} done # create Soft and Hard masks from RepeatMaster and TRF outputs: # and rebuild the 2bit file using the soft masking in the fa. # Might need the nibs for something, so make those too. # next machine ssh eieio cd /cluster/data/ce3 mkdir softMask mkdir nib cd /cluster/data/ce3 for C in I II III IV V X M do echo -n "masking chr${C} " maskOutFa sangerFa/chr${C}.fa RMRun/chr${C}.fa.out \ softMask/chr${C}.fa -soft maskOutFa softMask/chr${C}.fa \ bed/simpleRepeat/trfMask/chr${C}.bed \ softMask/chr${C}.fa -softAdd faToNib -softMask softMask/chr${C}.fa nib/chr${C}.nib done # masking chrI Writing 15080552 bases in 7540284 bytes # masking chrII Writing 15279311 bases in 7639664 bytes # masking chrIII Writing 13783317 bases in 6891667 bytes # masking chrIV Writing 17493785 bases in 8746901 bytes # masking chrV Writing 20922231 bases in 10461124 bytes # masking chrX Writing 17718850 bases in 8859433 bytes # masking chrM Writing 13794 bases in 6905 bytes # re-create the 2bit file rm -f ce3.2bit faToTwoBit softMask/chr*.fa ce3.2bit # create hard masks mkdir hardMask for C in I II III IV V X M do echo "masking chr${C}" /cluster/bin/i386/maskOutFa softMask/chr${C}.fa hard \ hardMask/chr${C}.fa done # copy to iscratch for cluster runs ssh kkr1u00 cd /cluster/data/ce3/softMask mkdir -p /iscratch/i/worms/Celegans3/bothMasksFa mkdir -p /iscratch/i/worms/Celegans3/nib cp -p *.fa /iscratch/i/worms/Celegans3/bothMasksFa cd /cluster/data/ce3/nib cp -p c*.nib /iscratch/i/worms/Celegans3/nib cd /iscratch/i/worms/Celegans3 cp -p /cluster/data/ce3/ce3.2bit . # no longer need these items rm -fr unmaskedSplit split103 unmaskedFa for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \ kkr${i}u00:/iscratch/i/worms/ done ####################################################################### # GOLD AND GAP TABLES (DONE - 2005-05-02 - Hiram) ssh hgwdev cd /cluster/data/ce3 awk '{print $1}' chrom.sizes | sed -e "s/chr//" > chrom.lst grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst grep -v random chrom.sizes | cut -f1 > chrom.lst hgGoldGapGl -noGl -chromLst=chrom.lst ce3 /cluster/data/ce3 sangerFa # # There are no gaps in this assembly, the gap tables are all empty # ####################################################################### # GC5BASE (DONE - 2004-12-08 - Hiram) ssh eieio mkdir /cluster/data/ce3/bed/gc5Base cd /cluster/data/ce3/bed/gc5Base hgGcPercent -wigOut -doGaps -file=stdout -win=5 ce3 \ /cluster/data/ce3 | wigEncode stdin gc5Base.wig gc5Base.wib ssh hgwdev cd /cluster/data/ce3/bed/gc5Base mkdir /gbdb/ce3/wib ln -s `pwd`/gc5Base.wib /gbdb/ce3/wib hgLoadWiggle ce3 gc5Base gc5Base.wig ########################################################################### # CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear) # FETCH acedb database for construction of Sanger Links table # DONE - 2006-01-06 - 2006-01-09 - Hiram ssh kkstore02 mkdir /cluster/data/ce3/bed/acedb cd /cluster/data/ce3/bed/acedb wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=4 --no-parent \ --no-host-directories --cut-dirs=4 \ ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS140/ # Then, to install this acedb database: chmod +x INSTALL ./INSTALL # It asks a question about disk space, you should have at least # 15 Mb free, answer the question with: yes # if that is true # Start up the AceDb monitor mkdir /cluster/data/ce3/bed/sangerLinks cd /cluster/data/ce3/bed/sangerLinks # fetch from our version of uniProt, name relationships # acc == uniProt ID # extAcc1 == WormBase Gene number # extAcc2 == orfName or Gene Class name # extAcc3 == haven't seen anything in this column yet hgsql -N -e \ "select acc,extAcc1,extAcc2,extAcc3 from extDbRef where extDb=19;" uniProt \ > uniProt.Celegans.txt tar xvzf ../acedb/wormpep140.tar.gz wormpep140/wormpep140 zcat ../acedb/geneIDs.WS140.gz > geneIDs.WS140.txt /cluster/bin/acedb/tace /cluster/data/ce3/bed/acedb # At prompt, can type ? for help, type the following queries # wbGeneClass.txt - associates 3 letter first part of worm gene # name with a brief description. acedb> AQL -o wbGeneClass.txt select l,l->Description from l in class Gene_Class # it says: # // 1471 Active Objects # wbLongDescr.txt - associates worm gene name with a several # sentence description. acedb> AQL -o wbLongDescr.txt select l,l->Gene_info[Provisional_description] from l in class Gene # it says: # // 5296 Active Objects # ctrl-D to exit the acedb monitor # remove header lines and "" from those two files: grep -v "^Format" wbGeneClass.txt | \ sed -e 's/Gene_class://; s/Text://; s/"//g;' \ > wbGeneClass.new grep -v "^Format" wbLongDescr.txt | \ sed -e 's/Gene://; s/Text://; s/"//g;' \ > wbLongDescr.new mv wbGeneClass.new wbGeneClass.txt mv wbLongDescr.new wbLongDescr.txt # Check how we compare to previous build, there will probably be a # few more with each new build: (I don't know why the bigger one # is so much larger this time) wc wbGeneClass.txt wbLongDescr.txt # 1515 6589 47870 wbGeneClass.txt # 50417 421524 3220249 wbLongDescr.txt wc /cluster/data/ce2/bed/steinHelp/wb*.txt # 1393 5982 43094 /cluster/data/ce2/bed/steinHelp/wbGeneClass.txt # 18400 197407 1378516 /cluster/data/ce2/bed/steinHelp/wbConcise.txt # There is a copy of this script in the source tree: # /kent/src/hg/near/hgWormLinks/ # There is also an hgWormLinks program there, with slightly # different inputs and outputs. ./mkSangerLinks.pl wormpep140/wormpep.table140 geneIDs.WS140.txt \ wbLongDescr.txt wbGeneClass.txt uniProt.Celegans.txt \ > sangerLinks.txt 2> dbg.out # should be relatively few names without descriptions: grep "no descr" dbg.out | wc # 1214 8498 67382 ssh hgwdev cd /cluster/data/ce3/bed/sangerLinks hgsql ce3 < $HOME/kent/src/hg/lib/sangerLinks.sql hgsql ce3 -e \ 'load data local infile "sangerLinks.txt" into table sangerLinks;' # the index for protName doesn't seem to get populated hgsql -e "analyze table sangerLinks;" ce3 # should now see numbers (happen to be identical) for each of the # indices: hgsql -e "show index from sangerLinks;" ce3 hgsql ce3 < /cluster/home/hiram/kent/src/hg/lib/orfToGene.sql hgsql ce3 -e 'load data local infile "orfToGene.tab" into table orfToGene;' ####################################################################### # MAKE SANGER GENE (WORM BASE GENES) TRACK (WORKING, 2005-05-02, Hiram) ssh eieio mkdir -p /cluster/data/ce3/bed/sangerGene cd /cluster/data/ce3/bed/sangerGene # the perl trims these files down to a reasonable size. As they are # they cause ldHgGene to hangup due to memory size. # The sed removes CDS before id e.g. from CDS "Y74C9A.2" and the # extra quote ", and finally the comments at the ends of the lines # which start with `blank semi-colon blank' for F in I II III IV V X M do echo -n "${F} " zcat ../../WS140/chr${F}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \ -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \ -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \ -e "s/CHROMOSOME_M/chrM/g" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \ perl -ne '@a=split; print if($a[1] =~ m/curated|DNA|RNA/i && $a[2] =~ m/intron|exon|cds|sequence|transcri/i);' | \ sed -e "s/CDS \"//; s/\"//; s/ ; .*//;" > chr${F}.gff done # check file sizes, should be reasonable ls -ogrt # -rw-rw-r-- 1 3524403 May 2 12:47 chrI.gff # -rw-rw-r-- 1 3732465 May 2 12:48 chrII.gff # -rw-rw-r-- 1 3295374 May 2 12:49 chrIII.gff # -rw-rw-r-- 1 3635497 May 2 12:51 chrIV.gff # -rw-rw-r-- 1 4754926 May 2 12:53 chrV.gff # -rw-rw-r-- 1 3896221 May 2 12:55 chrX.gff # -rw-rw-r-- 1 3881 May 2 12:55 chrM.gff # now load database with those transformed gff files # next machine ssh hgwdev cd /cluster/data/ce3/bed/sangerGene # 2004-05-10, hartera, Reload sangerGene table using extended GenePred # format. 2004-05-11, hartera. Extended format frame information does not # look correct. Reload without the extended fields. ldHgGene ce3 sangerGene *.gff # Read 23277 transcripts in 431929 lines in 7 files # 23277 groups 7 seqs 8 sources 10 feature types #23277 gene predictions XXXXX Stopped here 2005-05-02 14:30 # Add proteinID field to sangerGene table, used by Gene Sorter ssh hgwdev cd /cluster/data/ce3/bed/sangerGene hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce3 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce3 # Add Swiss-Prot protein IDs to this column # There are 23277 entries in the sangerGene table: hgsql -N -e "select count(*) from sangerGene;" ce3 # 23277 # And 22086 names in common between sangerGene and sangerLinks: hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \ where g.name = l.orfName;' ce3 # 22086 # get names from sangerGene and sangerLinks tables hgsql -N -e "select name from sangerGene;" ce3 | sort > name.sangerGene hgsql -N -e "select orfName from sangerLinks;" ce3 | sort > orfName.sangerLinks # get list of names in sangerGene not in sangerLinks comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks # split this file into 5 pieces, 400 lines each: split -l 300 geneNames.notin.sangerLinks # take each of those lists and paste those identifiers into # the wormbase batch_genes query, # http://ws140.wormbase.org/db/searches/batch_genes # requesting the Swiss Prot ID # You can cut and paste by viewing the 400 line file # in your WEB browser as a text document. # only a couple of them are identified as having SP IDs: # cut and paste the results from the "Plain Text" output format F15E11.13 Q9N565 SWALL:Q9N565_CAEEL Y19D10A.13 SWALL:Q9UAU0_CAEEL Y19D10A.15 SWALL:Q9UAT7_CAEEL Y19D10A.9 SWALL:O44596_CAEEL Y45G12C.7 SWALL:Q95X13_CAEEL Y45G12C.9 SWALL:Q9N4Y6_CAEEL Y49C4A.2 ; SWALL:Q8MYQ4_CAEEL F25B5.4 Y49C4A.3 Y49C4A.3 ; SWALL:Q8MYQ4_CAEEL Y6B3B.12 Q5ZEQ2 ZK131.10 P09588 # CREATE SANGERPEP TABLE (DONE, 2005-08-15, hiram) # Must be done after sangerGene table load to have time stamps # correct for joinerCheck mkdir -p /cluster/data/ce3/bed/sangerPep cd /cluster/data/ce3/bed/sangerPep # Download peptide sequences from the Sanger Centre ftp site: wget -o ce3.fetch.log -r -l1 --no-directories --timestamping \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep140/wormpep140 # Load into database hgPepPred ce3 generic sangerPep wormpep140 # the sangerPep table is used by the sangerGene track gzip wormpep140 ssh kkr1u00 cd /iscratch/i/worms/ce3 zcat /cluster/data/ce3/bed/sangerPep/wormpep140.gz > wormpep140 mkdir splitPep faSplit sequence wormpep140 1000 splitPep/wp # rsync for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \ kkr${i}u00:/iscratch/i/worms/ done # CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (NOT DONE) mkdir /cluster/data/ce3/bed/orfToGene cd !$ # gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu # ORF e.g. Y110A7A.10 gene e.g. aap-1 awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt # reformat this for use in creating Sanger Links with hgWormLinks awk 'NF == 2' orfToGene.txt > orfToGene.txt2 # use perl script to move ORFs with the same gene name onto separate lines # and remove lines where there is a gene name with an alternate name # in parentheses. this output file is used for Sanger Links /cluster/data/ce3/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2 hgCeOrfToGene ce3 gene_names.txt sangerGene orfToGene ######################################################################### # BLASTZ HUMAN Cb1 (WORKING - 2005-05-02 Hiram) # going to try and do this with ce3 chroms vs the cb1 contigs, and # then lift to chrUn ssh eieio mkdir /cluster/data/ce3/bed/blastzCb1.2005_05_02 cd /cluster/data/ce3/bed ln -s blastzCb1.2005_05_02 blastz.cb1 cd blastzCb1.2005_05_02 cat << '_EOF_' > DEF # briggsae vs. elegans export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: elegans Ce3 SEQ1_DIR=/iscratch/i/worms/Celegans3/ce3.2bit SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: briggsae cb1 SEQ2_DIR=/iscratch/i/worms/Cbriggsae/cb1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=30000000 SEQ2_LAP=0 BASE=/cluster/data/ce3/bed/blastzCb1.2005_05_02 DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp /cluster/data/ce3/chrom.sizes ./S1.len sort -rn +1 /cluster/data/cb1/chrom.sizes > S2.len # establish a screen to control this job screen cd /cluster/data/ce3/bed/blastzCb1.2005_05_02 time /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF > \ blast.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh eieio screen -d -r # STARTED - 2005-03-17 21:25 # swap results to place ce3 alignments onto cb1 ssh eieio cd /cluster/data/ce3/bed/blastzCb1.2005_05_02 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & # BLAT SERVER SETUP (DONE - 2005-06-02 - Hiram) hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES("ce3", "blat15", 17784, 1, 0);' \ -h localhost hgcentraltest hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES("ce3", "blat15", 17785, 0, 1);' \ -h localhost hgcentraltest ########################################################################## # Convenient rsync for kkr1u00 to copy the worm stuff quickly ssh kkr1u00 for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/ \ kkr${i}u00:/iscratch/i/worms/ done ########################################################################## # BLASTZ C. elegans ce3 (DONE - 2005-08-10 - Hiram) ssh kk mkdir /cluster/data/ce3/bed/blastzCb2.2005_08_10 cd /cluster/data/ce3/bed/blastzCb2.2005_08_10 cat << '_EOF_' > DEF # Ce3 vs Cb2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb3 SEQ1_DIR=/iscratch/i/worms/ce3/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cb2 - full chroms only, no randoms SEQ2_DIR=/iscratch/i/worms/cb2/chromNib SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/ce3/bed/blastzCb2.2005_08_10 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp -p /cluster/data/ce3/chrom.sizes ./S1.len nibSize /iscratch/i/worms/cb2/chromNib/*.nib | \ awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \ `pwd`/DEF > blast.run.out 2>&1 & # real 85m16.486s # user 0m1.070s # sys 0m0.200s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kk screen -d -r # STARTED - 2005-08-10 16:22 # FINISHED - 2005-08-10 17:45 # blastz kluster run # Completed: 182 of 182 jobs # CPU time in finished jobs: 109157s 1819.29m 30.32h 1.26d 0.003 y # IO & Wait Time: 1038s 17.30m 0.29h 0.01d 0.000 y # Average job time: 605s 10.09m 0.17h 0.01d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 4811s 80.18m 1.34h 0.06d # Submission to last job: 4813s 80.22m 1.34h 0.06d # doCatRun # Completed: 14 of 14 jobs # CPU time in finished jobs: 18s 0.29m 0.00h 0.00d 0.000 y # IO & Wait Time: 54s 0.91m 0.02h 0.00d 0.000 y # Average job time: 5s 0.09m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 10s 0.17m 0.00h 0.00d # Submission to last job: 10s 0.17m 0.00h 0.00d # doChainRun # Completed: 7 of 7 jobs # CPU time in finished jobs: 125s 2.09m 0.03h 0.00d 0.000 y # IO & Wait Time: 41s 0.68m 0.01h 0.00d 0.000 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 42s 0.70m 0.01h 0.00d # Submission to last job: 42s 0.70m 0.01h 0.00d # And then, a run with the randomContigs mkdir /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs cat << '_EOF_' > DEF # Ce3 vs Cb2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb3 SEQ1_DIR=/iscratch/i/worms/ce3/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cb2 - full chroms only, no randoms SEQ2_DIR=/iscratch/i/worms/cb2/randomContigs.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp -p /cluster/data/ce3/chrom.sizes ./S1.len twoBitInfo /iscratch/i/worms/cb2/randomContigs.2bit stdout | \ sort -rn +1 > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \ `pwd`/DEF > blast.run.out 2>&1 & # real 104m9.081s # user 0m1.180s # sys 0m0.150s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kk screen -d -r # blastz kluster run # Completed: 28 of 28 jobs # CPU time in finished jobs: 36050s 600.84m 10.01h 0.42d 0.001 y # IO & Wait Time: 259s 4.31m 0.07h 0.00d 0.000 y # Average job time: 1297s 21.61m 0.36h 0.02d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 6144s 102.40m 1.71h 0.07d # Submission to last job: 6144s 102.40m 1.71h 0.07d # doCatRun # Completed: 14 of 14 jobs # CPU time in finished jobs: 5s 0.08m 0.00h 0.00d 0.000 y # IO & Wait Time: 43s 0.72m 0.01h 0.00d 0.000 y # Average job time: 3s 0.06m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 5s 0.08m 0.00h 0.00d # Submission to last job: 7s 0.12m 0.00h 0.00d # doChainRun # Completed: 7 of 7 jobs # CPU time in finished jobs: 29s 0.48m 0.01h 0.00d 0.000 y # IO & Wait Time: 25s 0.42m 0.01h 0.00d 0.000 y # Average job time: 8s 0.13m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 16s 0.27m 0.00h 0.00d # Submission to last job: 16s 0.27m 0.00h 0.00d # Now, lift those results to their chr*_random coordinates: ssh kkstore02 cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/randomContigs/axtChain mkdir liftedChain for F in chain/*.chain do C=${F#chain/} C=${C/.chain/} echo "$F -> liftedChain/$C" liftUp -chainQ liftedChain/${C}.lifted.chain \ /cluster/bluearc/scratch/hg/cb2/liftRandoms.lft warn $F done # now merge these lifted chain files with the existing chain files for # the chroms and then sort by score using chainSort cd /cluster/data/ce3/bed/blastzCb2.2005_08_10/axtChain # gzipped file is only chains for chromsI-X and chrM so rename mv ce3.cb2.all.chain.gz ce3.cb2.chroms.chain.gz mv chain chromChain mkdir chain chainUnSorted # get all chains to be merged in chainUnSorted dir cp -p ./chromChain/*.chain ./chainUnSorted/ # copy contig chains, these are *.lifted.chain so they do not # write over the chrom chains. cp -p ../randomContigs/axtChain/liftedChain/*.chain ./chainUnSorted/ # then merge and sort all these chains. they must be merged and all # sorted together so that all IDs are unique across all chroms. # IDs are reassigned by chainMergeSort so that IDs are unique. chainMergeSort chainUnSorted/*.chain | gzip > ce3.cb2.all.chain.gz # use chainSplit to split this into chains again chainSplit chain ce3.cb2.all.chain.gz # then pick up the doBlastzChainNet.pl script with the net step ssh kk cd /cluster/data/ce3/bed/blastzCb2.2005_08_10 cp -p DEF DEF.chroms # edit DEF so SEQ2_DIR=/iscratch/i/worms/cb2/allNib # as need all nib files now # And complete S2.len nibSize /iscratch/i/worms/cb2/nib/*.nib | \ awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S2.len # establish a screen to control this job screen time /cluster/bin/scripts/doBlastzChainNet.pl -continue net \ `pwd`/DEF > net.run.out 2>&1 & # real 5m32.249s # user 0m0.510s # sys 0m0.090s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kk screen -d -r # swap results to place ce3 alignments onto cb2 time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \ swap.run.out 2>&1 & # real 7m27.028s # user 0m1.770s # sys 0m0.190s ########################################################################## # BLASTZ C. remanei caeRem1 (DONE - 2005-08-18 - Hiram) ssh pk mkdir /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18 cd /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18 cat << '_EOF_' > DEF # Ce3 vs Cb2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: Cb3 SEQ1_DIR=/san/sanvol1/scratch/worms/ce3/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: CaeRem1 SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/cluster/data/ce3/bed/blastzCaeRem1.2005_08_18 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << keep emacs coloring happy cp -p /cluster/data/ce3/chrom.sizes ./S1.len twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit stdout | \ sort -rn +1 > S2.len # establish a screen to control this job # altered local copy of doBlastzChainNet.pl to fix santest location # and run everything on pk since the san location is unique here # and these jobs are not going to be very large anyway screen time ./doBlastzChainNet.pl \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -fileServer=kkstore02 \ `pwd`/DEF > blast.run.out 2>&1 & # real 33m25.177s # user 0m0.561s # sys 0m0.222s # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ssh kk screen -d -r # swap results to place ce3 alignments onto caeRem1 cd /cluster/data/ce3/bed/blastzCaeRem1.2005_08_18 time ./doBlastzChainNet.pl -swap \ -bigClusterHub=pk \ -smallClusterHub=pk \ -workhorse=pk \ -fileServer=kkstore02 \ `pwd`/DEF > swap.run.out 2>&1 & # real 17m13.957s # user 0m0.395s # sys 0m0.148s ######################################################################## # Aceview (Acembly) Gene set: (DONE - 2005-09-07 - Hiram) # Data is obtained from # Danielle et Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov ssh hgwdev mkdir /cluster/data/ce3/bed/aceview cd /cluster/data/ce3/bed/aceview wget --timestamping \ 'ftp://anonymous:ucsc@ftp.ncbi.nlm.nih.gov/repository/acedb/wormgenes/aceview.wormgenes.2005_09_06.gff.gz' # Verify no reversed blocks zcat aceview.wormgenes.2005_09_06.gff.gz \ | awk -F"\t" '{if ($4 > $5) {printf "reverse blocks problem at:\n$0" } }' zcat aceview.wormgenes.2005_09_06.gff.gz | \ sed -e "s/CHROMOSOME_/chr/; s/MtDNA/M/" \ | ldHgGene -gtf ce3 acembly stdin # Read 22456 transcripts in 412173 lines in 1 files # 22456 groups 7 seqs 1 sources 5 feature types # Check chrom names: checkTableCoords -table=acembly ce3 hgsql -N -e "select chrom from acembly;" ce3 | sort | uniq -c # 3539 chrI # 3836 chrII # 3491 chrIII # 3734 chrIV # 7 chrM # 4629 chrV # 3220 chrX featureBits ce3 acembly # 25509966 bases of 100291840 (25.436%) in intersection ############################################################################# # BLASTZ Cb2 (WORKING - 2005-09-09 - Hiram) ssh kk mkdir /cluster/data/ce3/bed/blastzCb2.2005-09-09 cd /cluster/data/ce3/bed ln -s blastzCb2.2005-09-09 blastz.cb2 cd blastzCb2.2005-09-09 # Utilizing tiny target chunks and a single query chunk to use the # dynamic masking available via the BLASTZ_M parameter cat << '_EOF_' > DEF # ce3 vs cb2 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin ALIGN=blastz-run.v7 BLASTZ=blastz.v7 BLASTZ_H=2000 BLASTZ_M=50 BLASTZ_ABRIDGE_REPEATS=0 # TARGET: elegans Ce3 SEQ1_DIR=/san/sanvol1/scratch/worms/ce3/nib SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=1000000 SEQ1_LAP=100 # QUERY: briggsae Cb2 SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=200000000 SEQ2_LAP=0 BASE=/cluster/data/ce3/bed/blastzCb2.2005-09-09 SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # happy emacs nibSize /san/sanvol1/scratch/worms/ce3/nib/*.nib | \ awk '{printf "%s\t%s\n", $2, $3}' | sort -rn +1 > S1.len twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S2.len twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \ | grep random >> S2.len twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \ | grep chrUn >> S2.len XXXX - working on modifications to doBlastzChainNet.pl 2005-09-06 XXXX - to do this random contigs business properly # establish a screen to control this job screen time ./doBlastzChainNet.pl -stop chainMerge \ -bigClusterHub=kk \ `pwd`/DEF > toChainMerge.run.out 2>&1 & # STARTED - 2005-09-09 - 16:18 # with lots of kluster contention # Completed: 104 of 104 jobs # CPU time in finished jobs: 125426s 2090.44m 34.84h 1.45d 0.004 y # IO & Wait Time: 1821s 30.35m 0.51h 0.02d 0.000 y # Average job time: 1224s 20.39m 0.34h 0.01d # Longest finished job: 2849s 47.48m 0.79h 0.03d # Submission to last job: 18148s 302.47m 5.04h 0.21d time ./doBlastzChainNet.pl -continue cat -stop chainRun \ -bigClusterHub=kk \ `pwd`/DEF > toChainRun.run.out 2>&1 & # Completed: 104 of 104 jobs # CPU time in finished jobs: 31s 0.52m 0.01h 0.00d 0.000 y # IO & Wait Time: 351s 5.84m 0.10h 0.00d 0.000 y # Average job time: 4s 0.06m 0.00h 0.00d # Longest finished job: 12s 0.20m 0.00h 0.00d # Submission to last job: 211s 3.52m 0.06h 0.00d time ./doBlastzChainNet.pl -continue chainRun -stop chainMerge \ -bigClusterHub=kk \ `pwd`/DEF > toChainMerge.run.out 2>&1 & #Completed: 7 of 7 jobs #CPU time in finished jobs: 347s 5.78m 0.10h 0.00d 0.000 y #IO & Wait Time: 65s 1.08m 0.02h 0.00d 0.000 y #Average job time: 59s 0.98m 0.02h 0.00d #Longest running job: 0s 0.00m 0.00h 0.00d #Longest finished job: 120s 2.00m 0.03h 0.00d #Submission to last job: 696s 11.60m 0.19h 0.01d time ./doBlastzChainNet.pl -continue chainMerge -stop net \ -bigClusterHub=kk \ `pwd`/DEF > toNet.run.out 2>&1 & time ./doBlastzChainNet.pl -continue load -stop load \ -bigClusterHub=kk \ `pwd`/DEF > loadStep.run.out 2>&1 & time ./doBlastzChainNet.pl -continue download -stop cleanup \ -bigClusterHub=kk \ `pwd`/DEF > thruCleanup.run.out 2>&1 & # swap results to place ce3 alignments onto cb2 time ./doBlastzChainNet.pl -swap \ -bigClusterHub=kk \ `pwd`/DEF > \ swap.run.out 2>&1 & # detach from screen session: Ctrl-a Ctrl-d # to reattach to this screen session: ####################################################################### # 3-WAY Multiz (DONE - 2005-12-13 - Hiram) mkdir /cluster/data/ce3/bed/multiz3way cd /cluster/data/ce3/bed/multiz3way cat << '_EOF_' > 3way.nh ((ce3:0.2,caeRem1:0.2):0.3,cb2:0.3) '_EOF_' /cluster/bin/phast/draw_tree 3way.nh > 3way.ps convert 3way.ps 3way.jpg # featureBits chainLink measures # chainMm7Link chain linearGap # distance on Ce3 on other minScore # 1 0.3 - remanei caeRem1 (% 46.128) (% 37.675) 1000 loose # 2 0.2 - briggsae cb2 (% 42.047) (% 39.062) 1000 loose mkdir mafLinks mkdir mafLinks/caeRem1 mkdir mafLinks/ce3 cd mafLinks/caeRem1 ln -s ../../../blastzCaeRem1.2005_08_18/mafNet/*.maf.gz . cd ../ce3 ln -s ../../../blastzCb2.2005-09-09/mafNet/*.maf.gz . ssh pk mkdir -p /san/sanvol1/scratch/ce3/multiz3way cd /san/sanvol1/scratch/ce3/multiz3way rsync -a --progress --copy-links \ /cluster/data/ce3/bed/multiz3way/mafLinks/ . mkdir penn cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn cd /cluster/data/ce3/bed/multiz3way mkdir -p maf mkdir -p run cd run # create scripts to run var_multiz on cluster cat > oneMultiz.csh << 'EOF' #!/bin/csh -fe set c = $1 set db = ce3 set multi = /scratch/tmp/$db/multiz3way.$c set pairs = /san/sanvol1/scratch/$db/multiz3way set penn = $pairs/penn # special mode -- # with 1 arg, cleanup if ($#argv == 1) then echo "cleanup" echo "rm -fr $multi" rm -fr $multi echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db" rmdir --ignore-fail-on-non-empty /scratch/tmp/$db exit endif # special mode -- # with 3 args, saves an alignment file if ($#argv == 3) then echo "cp $multi/$2/$c.maf $3" ls -og $multi/$2/$c.maf cp $multi/$2/$c.maf $3 exit endif set s1 = $2 set s2 = $3 set flag = $4 # locate input files -- in pairwise dir, or multiple dir set d1 = $multi set d2 = $multi if (-d $pairs/$s1) then set d1 = $pairs set f1 = $d1/$s1/$c.maf.gz set t1 = /tmp/$s1.$c.maf zcat $f1 > $t1 else set f1 = $d1/$s1/$c.maf set t1 = /tmp/$s1.$c.maf cp -p $f1 $t1 endif if (-d $pairs/$s2) then set d2 = $pairs set f2 = $d2/$s2/$c.maf.gz set t2 = /tmp/$s2.$c.maf zcat $f2 > $t2 else set f2 = $d2/$s2/$c.maf set t2 = /tmp/$s2.$c.maf cp -p $f2 $t2 endif # write to output dir set out = $multi/${s1}${s2} mkdir -p $out # check for empty input file if (-s $t1 && -s $t2) then echo "Aligning $f1 $f2 $flag" $penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \ $out/$c.unused2.maf > $out/$c.full.maf cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \ $out/$c.tmp.maf echo "Ordering $c.maf" $penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf rm -f $t1 $t2 else if (-s $t1) then cp -p $t1 $out/$c.maf rm -f $t1 else if (-s $t2) then cp -p $t2 $out/$c.maf rm -f $t2 endif 'EOF' # << keep emacs coloring happy chmod +x oneMultiz.csh cp -p oneMultiz.csh /san/sanvol1/scratch/ce3/multiz3way/penn/oneMultiz.csh # using your tree diagram printed above, arrange these alignments # in order of the tree branches cat > allMultiz.csh << 'EOF' #!/bin/csh -fe # multiple alignment steps: set c = $1 set db = ce3 set s = "/san/sanvol1/scratch/$db/multiz3way/penn/oneMultiz.csh" $s $c cb2 caeRem1 0 # get final alignment file $s $c cb2caeRem1 /cluster/data/$db/bed/multiz3way/maf/$c.maf #cleanup $s $c 'EOF' # happy emacs chmod +x allMultiz.csh cat << 'EOF' > template #LOOP ./allMultiz.csh $(root1) {check out line+ /cluster/data/ce3/bed/multiz3way/maf/$(root1).maf} #ENDLOOP 'EOF' awk '{print $1}' ../../../chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList para try; para check ... etc para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 836s 13.93m 0.23h 0.01d 0.000 y # IO & Wait Time: 30s 0.50m 0.01h 0.00d 0.000 y # Average job time: 124s 2.06m 0.03h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 184s 3.07m 0.05h 0.00d # Submission to last job: 184s 3.07m 0.05h 0.00d ssh kkstore02 -d /cluster/data/ce3/bed/multiz3way time catDir maf > multiz3way.maf ls -og # -rw-rw-r-- 1 195770329 Dec 13 12:03 multiz3way.maf # Create per-chrom individual maf files for downloads mkdir mafDownloads for M in maf/chr*.maf do B=`basename $M` cp -p ${M} mafDownloads/${B} gzip mafDownloads/${B} echo ${B} done done ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce3/multiz3way cd /usr/local/apache/htdocs/goldenPath/ce3/multiz3way ln -s /cluster/data/ce3/bed/multiz3way/mafDownloads/chr*.maf.gz . # Load into database ssh hgwdev cd /cluster/data/ce3/bed/multiz3way mkdir /gbdb/ce3/multiz3way ln -s /cluster/data/ce3/bed/multiz3way/multiz3way.maf \ /gbdb/ce3/multiz3way time hgLoadMaf ce3 multiz3way # Loaded 203120 mafs in 1 files from /gbdb/ce3/multiz3way # real 0m30.526s time hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 ce3 \ multiz3waySummary multiz3way.maf # Processed 305166 components in 203120 mafs from multiz3way.maf # real 0m26.164s ####################################################################### # phastCons conservation scores on the 3-way # DONE - 2005-12-13 - 2005-12-14 - Hiram ssh kkstore02 mkdir /cluster/data/ce3/bed/multiz3way/cons cd /cluster/data/ce3/bed/multiz3way/cons /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \ --refseq ../../../softMask/chrV.fa --in-format MAF \ --windows 100000000,1000 --out-format SS \ --between-blocks 5000 --out-root s1 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \ --tree "(ce3,(caeRem1,cb2))" \ --out-root starting-tree rm s1.*.ss # add up the C and G: grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}' # 0.372 # This 0.372 is used in the --gc argument below # Create big bad bloated SS files on san filesystem ssh kkstore02 mkdir -p /san/sanvol1/scratch/ce3/cons/ss cd /san/sanvol1/scratch/ce3/cons/ss for C in `awk '{print $1}' /cluster/data/ce3/chrom.sizes` do if [ -s /cluster/data/ce3/bed/multiz3way/maf/${C}.maf ]; then mkdir ${C} echo msa_split $C chrN=${C/chr/} chrN=${chrN/_random/} /cluster/bin/phast/$MACHTYPE/msa_split \ /cluster/data/ce3/bed/multiz3way/maf/${C}.maf \ --refseq /cluster/data/ce3/softMask/${C}.fa \ --in-format MAF --windows 1000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} fi done # Create a random list of 50 1 mb regions (do not use the _randoms) cd /san/sanvol1/scratch/ce3/cons/ss ls -1l chr*/chr*.ss | grep -v random | \ awk '$5 > 3000000 {print $9;}' | randomLines stdin 50 ../randomSs.list # Set up parasol directory to calculate trees on these 50 regions ssh pk mkdir /san/sanvol1/scratch/ce3/cons/treeRun4 cd /san/sanvol1/scratch/ce3/cons/treeRun4 mkdir tree log # Tuning this loop should come back to here to recalculate # Create script that calls phastCons with right arguments # We are tuning target-coverage and expected-length cat > makeTree.csh << '_EOF_' #!/bin/csh -fe set C=$1:h mkdir -p log/${C} tree/${C} /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \ /cluster/data/ce3/bed/multiz3way/cons/starting-tree.mod \ --gc 0.372 --nrates 1,1 --no-post-probs --ignore-missing \ --expected-length 27 --target-coverage 0.45 \ --quiet --log log/$1 --estimate-trees tree/$1 '_EOF_' # emacs happy chmod a+x makeTree.csh # Create gensub file cat > template << '_EOF_' #LOOP makeTree.csh $(path1) #ENDLOOP '_EOF_' # happy emacs # Make cluster job and run it gensub2 ../randomSs.list single template jobList para create jobList para try/push/check/etc # Completed: 50 of 50 jobs # CPU time in finished jobs: 919s 15.31m 0.26h 0.01d 0.000 y # IO & Wait Time: 197s 3.29m 0.05h 0.00d 0.000 y # Average job time: 22s 0.37m 0.01h 0.00d # Longest finished job: 37s 0.62m 0.01h 0.00d # Now combine parameter estimates. We can average the .mod files # using phyloBoot. This must be done separately for the conserved # and nonconserved models ssh kkstore02 cd /san/sanvol1/scratch/ce3/cons/treeRun4 ls -1 tree/chr*/*.cons.mod > cons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \ --output-average ave.cons.mod > cons_summary.txt 2>&1 ls -1 tree/chr*/*.noncons.mod > noncons.list /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \ --output-average ave.noncons.mod > noncons_summary.txt cd .. cp -p ave.*.mod /cluster/data/ce3/bed/multiz3way/cons # measuring entropy # consEntopy # ave.cons.mod ave.noncons.mod --NH 9.78 # never stops with the --NH argument /cluster/bin/phast/$MACHTYPE/consEntropy .45 27 \ ave.cons.mod ave.noncons.mod # aiming for PIT=10 bits # With a PIT of 10 bits at 27 and .45 we will use these numbers # for the full run # at --expected-length 27 --target-coverage 0.45 # Transition parameters: gamma=0.450000, omega=27.000000, mu=0.037037, # nu=0.030303 # Relative entropy: H=0.326190 bits/site # Expected min. length: L_min=30.684329 sites # Expected max. length: L_max=24.341047 sites # Phylogenetic information threshold: PIT=L_min*H=10.008930 bits # at --expected-length 22 --target-coverage 0.45 # Transition parameters: gamma=0.450000, omega=22.000000, mu=0.045455, # nu=0.037190 # Relative entropy: H=0.343713 bits/site # Expected min. length: L_min=27.428995 sites # Expected max. length: L_max=21.416559 sites # Phylogenetic information threshold: PIT=L_min*H=9.427716 bits # at --expected-length 25 --target-coverage 0.45 # Transition parameters: gamma=0.450000, omega=25.000000, mu=0.040000, # nu=0.032727 # Relative entropy: H=0.332608 bits/site # Expected min. length: L_min=29.436557 sites # Expected max. length: L_max=23.218502 sites # Phylogenetic information threshold: PIT=L_min*H=9.790834 bits # at --expected-length 12 --target-coverage 0.17 # Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, # nu=0.017068 # Relative entropy: H=0.363555 bits/site # Expected min. length: L_min=35.407009 sites # Expected max. length: L_max=16.168434 sites # Phylogenetic information threshold: PIT=L_min*H=12.872407 bits ssh pk # Create cluster dir to do main phastCons run mkdir /san/sanvol1/scratch/ce3/cons/consRun1 cd /san/sanvol1/scratch/ce3/cons/consRun1 mkdir ppRaw bed cp -p /san/sanvol1/scratch/ce3/cons/treeRun4/ave.*.mod . # Create script to run phastCons with right parameters # This job is I/O intensive in its output files, thus it is all # working over in /scratch/tmp/ cat > doPhast.csh << '_EOF_' #!/bin/csh -fe mkdir /scratch/tmp/${2} cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2} pushd /scratch/tmp/${2} > /dev/null /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \ --expected-length 27 --target-coverage 0.45 --quiet \ --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp popd > /dev/null mkdir -p ppRaw/${1} mkdir -p bed/${1} mv /scratch/tmp/${2}/${2}.pp ppRaw/${1} mv /scratch/tmp/${2}/${2}.bed bed/${1} rm /scratch/tmp/${2}/ave.*.mod rm /scratch/tmp/${2}/${2}.ss rmdir /scratch/tmp/${2} '_EOF_' # emacs happy chmod a+x doPhast.csh # root1 == chrom name, file1 == ss file name without .ss suffix # Create gsub file cat > template << '_EOF_' #LOOP doPhast.csh $(root1) $(file1) #ENDLOOP '_EOF_' # happy emacs # Create parasol batch and run it ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list gensub2 in.list single template jobList para create jobList para try/check/push/etc. # These jobs are very fast and very I/O intensive, even on the san # they can hang it up as they work at full tilt. # Completed: 104 of 104 jobs # CPU time in finished jobs: 331s 5.51m 0.09h 0.00d 0.000 y # IO & Wait Time: 1066s 17.77m 0.30h 0.01d 0.000 y # Average job time: 13s 0.22m 0.00h 0.00d # Longest running job: 0s 0.00m 0.00h 0.00d # Longest finished job: 54s 0.90m 0.01h 0.00d # Submission to last job: 75s 1.25m 0.02h 0.00d # combine predictions and transform scores to be in 0-1000 interval ssh kkstore02 cd /san/sanvol1/scratch/ce3/cons/consRun1 # The sed's and the sort get the file names in chrom,start order find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed # ~ 1 minute cp -p mostConserved.bed /cluster/data/ce3/bed/multiz3way # Figure out how much is actually covered by the mostConserved data as so: # The 100291840 comes from the non-n genome size, # from faSize on all chroms: cd /cluster/data/ce3 faSize softMask/*.fa # 100291840 bases (0 N's 100291840 real 86891657 upper 13400183 lower) # in 7 sequences in 7 files cd /cluster/data/ce3/bed/multiz3way awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100291840\n",100.0*sum/100291840,sum}' \ mostConserved.bed -target-coverage 0.45: % 22.53 = 100.0*22595226/100291840 # Load most conserved track into database ssh hgwdev cd /cluster/data/ce3/bed/multiz3way hgLoadBed -strict ce3 phastConsElements mostConserved.bed # Loaded 185737 elements of size 5 # ~1 minute load time featureBits ce3 -enrichment sangerGene:cds phastConsElements # -target-coverage 0.45 and expected lengths 27: # sangerGene:cds 25.226%, phastConsElements 22.529%, both 13.627%, # cover 54.02%, enrich 2.40x # Create merged posterier probability file and wiggle track data files # the sed business gets the names sorted by chromName, chromStart # so that everything goes in numerical order into wigEncode ssh kkstore02 cd /san/sanvol1/scratch/ce3/cons/consRun1 find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | wigEncode stdin phastCons3.wig phastCons3.wib # about 40 seconds for above # -rw-rw-r-- 1 54890327 Dec 14 12:34 phastCons3.wib # -rw-rw-r-- 1 9271833 Dec 14 12:34 phastCons3.wig cp -p phastCons3.wig phastCons3.wib /cluster/data/ce3/bed/multiz3way # Load gbdb and database with wiggle. ssh hgwdev cd /cluster/data/ce3/bed/multiz3way ln -s `pwd`/phastCons3.wib /gbdb/ce3/wib/phastCons3.wib hgLoadWiggle ce3 phastCons3 phastCons3.wig # ~ 1.5 minute load # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce3 phastCons3 > histogram.data 2>&1 # about 8 minutes to scan all data cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " C. elegans Ce3 Histogram phastCons3 track" set xlabel " phastCons3 score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # happy emacs display histo.png & # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/ce3/cons/consRun1 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons3Scores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons3Scores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # happy emacs chmod +x gzipAscii.sh time ./gzipAscii.sh # takes 1 minute, makes 95 Mb of data # copy them for downloads ssh kkstore02 mkdir /cluster/data/ce3/bed/multiz3way/phastCons3Scores cd /cluster/data/ce3/bed/multiz3way/phastCons3Scores rsync -a --progress \ pk:/san/sanvol1/scratch/ce3/cons/consRun1/phastCons3Scores/ . # 10 second copy ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce3/phastCons3Scores cd /usr/local/apache/htdocs/goldenPath/ce3/phastCons3Scores ln -s /cluster/data/ce3/bed/multiz3way/phastCons3Scores/*.gz . ########################################################################### # preparing downloads (WORKING - 2005-12-14 - Hiram) ssh kkstore02 mkdir /cluster/data/ce3/goldenPath mkdir /cluster/data/ce3/goldenPath/bigZips mkdir /cluster/data/ce3/goldenPath/chromosomes cd /cluster/data/ce3/ cp -p softMask/chr*.fa goldenPath/chromosomes gzip goldenPath/chromosomes/chr*.fa cat RMRun/chrI.fa.out > goldenPath/bigZips/rmsk.out tail +4 RMRun/chrII.fa.out >> goldenPath/bigZips/rmsk.out tail +4 RMRun/chrIII.fa.out >> goldenPath/bigZips/rmsk.out tail +4 RMRun/chrIV.fa.out >> goldenPath/bigZips/rmsk.out tail +4 RMRun/chrV.fa.out >> goldenPath/bigZips/rmsk.out tail +4 RMRun/chrX.fa.out >> goldenPath/bigZips/rmsk.out tail +4 RMRun/chrM.fa.out >> goldenPath/bigZips/rmsk.out gzip goldenPath/bigZips/rmsk.out cd softMask tar cvzf ../goldenPath/bigZips/chromFa.tar.gz ./chr*.fa ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce3/bigZips mkdir /usr/local/apache/htdocs/goldenPath/ce3/chromosomes cd /usr/local/apache/htdocs/goldenPath/ce3/bigZips ln -s /cluster/data/ce3/goldenPath/bigZips/* . cd /usr/local/apache/htdocs/goldenPath/ce3/chromosomes ln -s /cluster/data/ce3/goldenPath/chromosomes/chr*.fa.gz . ########################################################################### # MAKE 11.OOC FILE FOR BLAT and GENBANK (DONE - 2006-01-11 - hiram) # Use -repMatch=40 (based on size, for human use 1024) ssh kkr1u00 cd /iscratch/i/worms/Celegans3 ls -1 `pwd`/nib/chr*.nib > nib.lst time /cluster/bin/x86_64/blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=11.ooc -repMatch=40 # Wrote 43651 overused 11-mers to 11.ooc # real 0m4.562s rm nib.lst # copy to other iservers for i in 2 3 4 5 6 7 8 do rsync -a --progress --delete --rsh=rsh /iscratch/i/worms/Celegans3/ \ kkr${i}u00:/iscratch/i/worms/Celegans3/ done ########################################################################### # ENABLE GENBANK UPDATE (DONE 2006-01-11 - Hiram) # add ce3 to the following three files and check them in. # The genbank.conf entry is a copy of the existing ce2 entry with # appropriate changes for pathnames, etc ... src/hg/makeDb/genbank/etc/align.dbs src/hg/makeDb/genbank/etc/hgwdev.dbs src/hg/makeDb/genbank/etc/genbank.conf # go to /cluster/data/genbank/etc and do cvs update on these three files. ssh kkstore02 cd /cluster/data/genbank nice ./bin/gbAlignStep -initial ce3 # logFile: var/build/logs/2006.01.11-11:30:41.ce3.initalign.log ssh hgwdev cd /cluster/data/genbank nice ./bin/gbDbLoadStep -initialLoad ce3 # var/dbload/hgwdev/logs/2006.01.11-12:24:14.dbload.log ########################################################################### # MAKE WORMBASE GENEFINDER TRACKS (DONE, 2006-01-11, hartera) # next machine ssh kkstore02 mkdir -p /cluster/data/ce3/bed/sangerGenefinder cd /cluster/data/ce3/bed/sangerGenefinder # the perl trims these files down to a reasonable size. As they are # they cause ldHgGene to hangup due to memory size. for C in I II III IV V X M do echo -n "${C} " zcat ../../WS140/chr${C}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \ -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \ -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \ -e "s/CHROMOSOME_M/chrM/g" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \ perl -ne '@a = split; print if(($a[1] =~ m/Genefinder/i) && ($a[2] =~ m/intron|exon|cds|sequence|transcri/i));' | \ sed -e 's/CDS //g; s/"//g' > chr${C}.gff done # check file sizes, should be reasonable ls -ogrt total 25024 -rw-rw-r-- 1 3927756 Jan 11 15:49 chrI.gff -rw-rw-r-- 1 4205150 Jan 11 15:50 chrII.gff -rw-rw-r-- 1 3567022 Jan 11 15:50 chrIII.gff -rw-rw-r-- 1 4167290 Jan 11 15:51 chrIV.gff -rw-rw-r-- 1 5547317 Jan 11 15:51 chrV.gff -rw-rw-r-- 1 4165641 Jan 11 15:52 chrX.gff -rw-rw-r-- 1 0 Jan 11 15:52 chrM.gff # now load database with those transformed gff files ssh hgwdev cd /cluster/data/ce3/bed/sangerGenefinder time ldHgGene ce3 sangerGenefinder *.gff # Read 62263 transcripts in 401688 lines in 7 files # 62263 groups 6 seqs 1 sources 4 feature types # 21182 gene predictions # real 0m7.532s ######################################################################## # TWINSCAN GENE PREDICTIONS (DONE, 2006-01-11, hartera) # These are Iscan (new version of Twinscan) gene predictions # e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data # *if* it isn't already at the location below. If it has been # some time since the WS release, it is probably already there. ssh hgwdev mkdir /cluster/data/ce3/bed/twinscan cd /cluster/data/ce3/bed/twinscan for D in chr_gtf chr_ptx chr_tx do mkdir ${D} cd ${D} wget --timestamping --force-directories --directory-prefix=. \ --dont-remove-listing --recursive --level=1 --no-parent \ --no-host-directories --cut-dirs=6 \ "http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS140/Twinscan/${D}/" cd .. done # clean up index files rm chr*/index* # There used to be a lot of chrom name fixups to these files, but # it appears with this set the names are already in our usual chrN # format rather than the previous CHROMOSOME_N format # Except for chrM sed -e "s/chrMtDNA/chrM/g" chr_gtf/CHROMOSOME_MtDNA.gtf \ > chr_gtf/CHROMOSOME_M.gtf mv chr_gtf/CHROMOSOME_MtDNA.gtf chr_gtf/CHROMOSOME_MtDNA.gtf.original sed -e "s/chrMtDNA/chrM/g" chr_ptx/CHROMOSOME_MtDNA.ptx \ > chr_ptx/CHROMOSOME_M.ptx mv chr_ptx/CHROMOSOME_MtDNA.ptx chr_ptx/CHROMOSOME_MtDNA.ptx.original # load data into database, need -gtf option as no stop codons in # these predictions. also use new -frame -id and -geneName options ldHgGene -gtf -genePredExt ce3 twinscan chr_gtf/CHROMOSOME_*.gtf # Read 21985 transcripts in 169772 lines in 7 files # 21985 groups 7 seqs 1 sources 3 feature types # 21985 gene predictions hgPepPred ce3 generic twinscanPep chr_ptx/CHROMOSOME_*.ptx ######################################################################## # MAKE ALT. SPLICING TRACK (DONE - 2006-01-12 - Hiram) # create rnaCluster table and Gene Bounds track ssh hgwdev mkdir /cluster/data/ce3/bed/rnaCluster cd /cluster/data/ce3/bed/rnaCluster for C in I II III IV V X do echo "clusterRna ce3 /dev/null chr${C}.bed -chrom=${C}" clusterRna ce3 /dev/null chr${C}.bed -chrom=chr${C} done # Are we getting about the same numbers as before ? # Do a wc on these bed files, should be something like: wc *.bed | sort -n # 2628 31536 193683 chrIII.bed # 2767 33204 194963 chrX.bed # 2878 34536 202197 chrI.bed # 3245 38940 235165 chrIV.bed # 3506 42072 251726 chrII.bed # 4981 59772 353344 chrV.bed # 20005 240060 1431078 total hgLoadBed -strict ce3 rnaCluster *.bed hgsql -N -e "select chrom from chromInfo" ce3 > chrom.list (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce3 | egrep -v chromStart | cut -f2\-13 > $_.ce3.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.list > rnaCluster.sh chmod a+x rnaCluster.sh ./rnaCluster.sh # And the result of that should look like: wc *.rnaCluster.bed | sort -n # 0 0 0 chrM.ce3.rnaCluster.bed # 2628 31536 193683 chrIII.ce3.rnaCluster.bed # 2767 33204 194963 chrX.ce3.rnaCluster.bed # 2878 34536 202197 chrI.ce3.rnaCluster.bed # 3245 38940 235165 chrIV.ce3.rnaCluster.bed # 3506 42072 251726 chrII.ce3.rnaCluster.bed # 4981 59772 353344 chrV.ce3.rnaCluster.bed # 20005 240060 1431078 total mkdir /cluster/data/ce3/bed/altSplice cd /cluster/data/ce3/bed/altSplice hgsql -N -e "select chrom from chromInfo" ce3 > chrom.list perl -e 'while(<>) {chomp($_); print "altSplice -db=ce3 -beds=../rnaCluster/$_.ce3.rnaCluster.bed -agxOut=agx/$_.ce3.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.list > altSplice.para.spec # these jobs are no big deal on the worm, just run this file: chmod +x altSplice.para.spec mkdir agx time ./altSplice.para.spec # real 6m55.219s # The result of that should look like: wc agx/*.agx | sort -n # 2579 46422 2379008 agx/chrIII.ce3.rnaCluster.agx # 2681 48258 2320195 agx/chrX.ce3.rnaCluster.agx # 2817 50706 2516155 agx/chrI.ce3.rnaCluster.agx # 3121 56178 2420981 agx/chrIV.ce3.rnaCluster.agx # 3388 60984 2454789 agx/chrII.ce3.rnaCluster.agx # 4873 87714 2939995 agx/chrV.ce3.rnaCluster.agx cat agx/*.agx > altSplice.agx agxToBed altSplice.agx altSplice.bed hgLoadBed -strict -notItemRgb \ -sqlTable=$HOME/kent/src/hg/lib/altGraphX.sql ce3 \ altGraphX altSplice.agx # Loaded 19459 elements of size 18 ######################################################################## # Create frames based page views of alt splice areas of interest # (DONE - 2005-01-12 - Hiram) # A link to this business was added to the description page # on each of the ce1 ce2 and ce3 browsers and all their # frames based viewpoints were placed in goldenPath to you can # click through from the description page ssh hgwdev mkdir /cluster/data/ce3/bed/altSplice/altAnalysis cd /cluster/data/ce3/bed/altSplice/altAnalysis altAnalysis ce3 ../altSplice.agx altSummary.txt links.html \ frame.html -bedName=all # creates files, (as well as several .bed files): # -rw-rw-r-- 1 282 May 16 09:19 frame.html # -rw-rw-r-- 1 391029 May 16 09:19 links.html # Copy these to the Intronerator WS120 location, change # default opening location: mkdir /usr/local/apache/htdocs/goldenPath/ce3/altGraphX sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \ > /usr/local/apache/htdocs/goldenPath/ce3/altGraphX/links.html sed -e \ "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \ -e "s/NM_005487/chrII:14689493-14690232/" \ -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \ frame.html \ > /usr/local/apache/htdocs/goldenPath/ce3/altGraphX/frame.html ######################################################################## # HGNEAR PROTEIN BLAST TABLES (DONE - 2005-01-12 - Hiram) ssh pk # Fetch WormBase peptides for this release mkdir /cluster/data/ce3/bed/blastp cd /cluster/data/ce3/bed/blastp wget --timestamping -O wormPep140.faa \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep140/wormpep140 mkdir /cluster/data/ce3/bed/hgNearBlastp cd /cluster/data/ce3/bed/hgNearBlastp cat << _EOF_ > config.ra # This elegans release vs. other Gene Sorter orgs: # self, human, mouse, rat, zebrafish, yeast targetGenesetPrefix wormBase targetDb ce3 queryDbs ce3 hg17 mm7 rn3 danRer3 sacCer1 ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa hg17Fa /cluster/data/hg17/bed/blastp/known.faa mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa rn3Fa /cluster/data/rn3/bed/blastp/known.faa danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa buildDir /cluster/data/ce3/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/ce3HgNearBlastp _EOF_ doHgNearBlastp.pl config.ra >& do.log & tail -f do.log cat << _EOF_ > config.ra # This elegans release vs. other Gene Sorter orgs: # self, human, mouse, rat, zebrafish, yeast targetGenesetPrefix wormBase targetDb ce3 queryDbs ce3 ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa buildDir /cluster/data/ce3/bed/hgNearBlastp scratchDir /san/sanvol1/scratch/ce3HgNearBlastp _EOF_ doHgNearBlastp.pl config.ra >& do.log & tail -f do.log ######################################################################### # Waba alignment with briggsae (WORKING - 2006-01-12 - Hiram) ssh pk # We need some unmasked FA for these things mkdir /san/sanvol1/scratch/worms/Celegans3/unmaskedFa cd /san/sanvol1/scratch/worms/Celegans3/unmaskedFa for C in I II III IV V X M do echo -n "chr${C}: " twoBitToFa -seq=chr${C} ../ce3.2bit stdout | \ tr [a-z] [A-Z] | sed -e "s/CHR${C}/chr${C}/" > chr${C}.fa faSize chr${C}.fa done mkdir /cluster/data/ce3/bed/cb2.waba cd /cluster/data/ce3/bed/cb2.waba for C in I II III IV V X I_random II_random III_random IV_random \ V_random X_random Un do ls -1S /san/sanvol1/scratch/worms/cb2/split/${C}/*.fa done > cb2.list ls -1S /san/sanvol1/scratch/worms/Celegans3/unmaskedFa/chr*.fa \ > elegans.list mkdir out cat elegans.list | while read FN do b=`basename ${FN}` mkdir out/${b%%.fa} done # create scripts to be used here cat << '_EOF_' > wabaRun #!/bin/csh -fe # # $1 - full pathname to a briggsae contig # $2 - file path to an elegans chrom.fa # $3 - result file full pathname # set f = $1:t set chr = $2:t set d = $chr:r mkdir -p /scratch/tmp/$d/$f cp $1 /scratch/tmp/$d/$f pushd . cd /scratch/tmp/$d/$f set t = $f:r /cluster/bin/x86_64/waba 1 $f $2 $t.1 /cluster/bin/x86_64/waba 2 $t.1 $t.2 /cluster/bin/x86_64/waba 3 $t.2 $t.wab popd cp -p /scratch/tmp/$d/$f/$t.wab $3 rm -f /scratch/tmp/$d/$f/$t.* rmdir --ignore-fail-on-non-empty /scratch/tmp/$d/$f rmdir --ignore-fail-on-non-empty /scratch/tmp/$d '_EOF_' # happy emacs chmod +x wabaRun cat << '_EOF_' > jobTemplate #LOOP ./wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists out/$(root2)/$(root1).wab} #ENDLOOP '_EOF_' # happy emacs gensub2 cb2.list elegans.list jobTemplate jobList para create jobList para try para check para push # one of the jobs takes quite a while. Most of the others are OK: Completed: 6950 of 6951 jobs Crashed: 1 jobs CPU time in finished jobs: 19284472s 321407.87m 5356.80h 223.20d 0.612 y IO & Wait Time: 63556s 1059.26m 17.65h 0.74d 0.002 y Average job time: 2784s 46.40m 0.77h 0.03d Longest job: 47017s 783.62m 13.06h 0.54d Submission to last job: 58555s 975.92m 16.27h 0.68d # The failed job is: # /cluster/store5/worm/ce3/bed/waba/wabaRun \ # /iscratch/i/worms/Cbriggsae/contigs/c0907.fa \ # /iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \ # /cluster/store5/worm/ce3/bed/waba/out/chrI/c0907.wab XXXX - running 2004-04-07 - retrying this stand-along on kkr1u00 # after about 1.5 hours fails with the error: # waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart && # c->qEnd == qEnd' failed. # Abort # Ignoring that error, and moving on # next machine ssh hgwdev cd /cluster/data/ce3/bed/waba mkdir Load cd Load # The cat through the pipe to hgWaba will avoid making large files # that are not needed. cat << '_EOF_' > loadEm.sh #!/bin/sh # for c in I II III IV V X M do echo -n "${c} " cat /cluster/store/data/ce3/bed/waba/out/chr${c}/c*.wab | hgWaba ce3 Cbr chr${c} 0 stdin > proc${c}.out 2>&1 done exit 0 '_EOF_' chmod +x loadEm.sh # run it to load the waba track: ./loadEm.sh # remove garbage temp file: rm full_waba.tab chrom_waba.tab # worm/ce3/trackDb.ra entry: # track cbrWaba # shortLabel Briggsae Waba # longLabel C. briggsae Waba Alignments # group compGeno # priority 20 # visibility dense # color 140,0,200 # altColor 210,140,250