# for emacs: -*- mode: sh; -*- # This file describes how to make the browser database for the # worm C. elegans # 2004-03-30 [DONE, 2004-05-20, hartera] # NOTE: this doc may have genePred loads that fail to include # the bin column. Please correct that for the next build by adding # a bin column when you make any of these tables: # # mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%"; # +------------------+-------------------------+ # | tableName | type | # +------------------+-------------------------+ # | sangerGene | genePred sangerPep | # | sangerGenefinder | genePred | # | refGene | genePred refPep refMrna | # | twinscan | genePred twinscanPep | # | geneid | genePred geneidPep | # +------------------+-------------------------+ # DOWNLOAD SEQUENCE (DONE, 2004-03-31, hartera) # next machine ssh eieio mkdir -p /cluster/store5/worm/ce2/sanger mkdir -p /cluster/store5/worm/ce2/tmp ln -s /cluster/store5/worm/ce2 /cluster/data/ce2 cd /cluster/data/ce2/sanger wget -o ce2.fetch.log -r -l1 --no-directories \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS120/CHROMOSOMES/ # This site is now at: # ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/CHROMOSOMES/ # Takes about five minutes # This current_release is updated every two weeks. Although the # sequence is quite stable at this time and changes very little. # Check that you have some valid files. Should be chroms I II III IV V X # in dna, gff and agp formats, also Mitochondrial DNA, MtDNA in dna and # gff formats. ls -ogrt # Rename CHROMOSOME_MtDNA files to CHROMOSOME_M and change to # "CHROMOSOME_M" inside files chmod u+w CHROMOSOME_M* zcat CHROMOSOME_MtDNA.dna.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \ | gzip > CHROMOSOME_M.dna.gz zcat CHROMOSOME_MtDNA.gff.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \ | gzip > CHROMOSOME_M.gff.gz # remove old files rm CHROMOSOME_MtDNA* # CHROMOSOME_M sequence is identical to X54252.1 in GenBank # create a .agp file for chrM as there is none and hgGoldGapGl and other # programs require a .agp file so create CHROMOSOME_M.agp: # M 1 13794 1 F X54252.1 1 13794 + # translate to unzipped .fa, all upper case, and # rename the agp files so hgGoldGap can find them mkdir sangerFa foreach f (I II III IV V X M) echo -n "${f} " zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa mkdir -p sangerFa/${f} ln -s /cluster/data/ce2/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp end # hgGoldGap does not handle dir names over 2 characters: mv sangerFa/III sangerFa/3 # CREATING DATABASE (DONE, 2004-03-31 - hartera) # Create the database. # next machine ssh hgwdev echo 'create database ce2' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING !!! echo 'drop database ce2' | hgsql ce2 # Use df to ake sure there is at least 5 gig free on df -h /var/lib/mysql # Before loading data: # Filesystem Size Used Avail Use% Mounted on # /dev/sdc1 1.8T 280G 1.4T 17% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-03-31 - hartera) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database galGal2 to our new database ce2 echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \ | hgsql ce2 # if you need to delete that table: !!! WILL DELETE ALL grp data !!! echo 'drop table grp;' | hgsql ce2 # MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-04-01, hartera) # This used to hold scripts -- better to keep them inline here so # they're in CVS. 
Now it should just hold lift file(s) and # temporary scripts made by copy-paste from this file. mkdir /cluster/data/ce2/jkStuff # This is where most tracks will be built: mkdir /cluster/data/ce2/bed # PREPARE Split contigs into 100,000 bp chunks for cluster runs # (DONE, 2004-04-01, hartera) # next machine ssh eieio cd /cluster/data/ce2 rm -fr ./split mkdir split foreach f (I II III IV V X M) mkdir split/$f faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft end 151 pieces of 151 written 153 pieces of 153 written 138 pieces of 138 written 175 pieces of 175 written 210 pieces of 210 written 178 pieces of 178 written 1 pieces of 1 written cat split/*.lft > liftAll.lft # copy them to /iscratch/i for cluster rsync # next machine ssh kkr1u00 cd /cluster/data/ce2/split foreach c (I II III IV V X M) echo -n "${c} " mkdir -p /iscratch/i/worms/Celegans2/unmaskedSplit/${c} cp -p ${c}/c*.fa /iscratch/i/worms/Celegans2/unmaskedSplit/${c} end iSync # Run RepeatMasker on the chromosomes (DONE, 2004-04-02 - hartera) # next machine ssh kk cd /cluster/data/ce2 # make run directory and job list, create the script to use # for the RepeatMasker run cat << '_EOF_' > jkStuff/RMWorm #!/bin/csh -fe # # This is a slight rearrangement of the # RMChicken script used in makeGalGal2.doc # The results here need to go to a different location # $1 == chrom name: I II III IV V X M # $2 == directory where split contig .fa is found # $3 == name of contig .fa file cd $1 pushd . cd $2 /bin/mkdir -p /tmp/ce2/$3/$1 /bin/cp $3 /tmp/ce2/$3/$1 cd /tmp/ce2/$3/$1 /cluster/bluearc/RepeatMasker/RepeatMasker -alignments -s -species elegans $3 popd /bin/cp /tmp/ce2/$3/$1/$3.out ./ if( -e /tmp/ce2/$3/$1/$3.align ) /bin/cp /tmp/ce2/$3/$1/$3.align ./ if (-e /tmp/ce2/$3/$1/$3.tbl) /bin/cp /tmp/ce2/$3/$1/$3.tbl ./ if (-e /tmp/ce2/$3/$1/$3.cat) /bin/cp /tmp/ce2/$3/$1/$3.cat ./ /bin/rm -r /tmp/ce2/$3/$1 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce2/$3 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce2 '_EOF_' # << this line makes emacs coloring happy chmod +x jkStuff/RMWorm # create job list mkdir RMRun rm -f RMRun/RMJobs foreach c (I II III IV V X M) mkdir /cluster/data/ce2/RMRun/${c} foreach t ( /iscratch/i/worms/Celegans2/unmaskedSplit/$c/c*.fa ) set d = $t:h set f = $t:t echo /cluster/data/ce2/jkStuff/RMWorm ${c} ${d} ${f} \ '{'check out line+ /cluster/data/ce2/RMRun/$c/${f}.out'}' \ >> RMRun/RMJobs end end # Do the run cd /cluster/data/ce2/RMRun para create RMJobs para try, para check, para check, para push, para check, ... para try: # para time # Checking finished jobs # Completed: 1006 of 1006 jobs # CPU time in finished jobs: 821747s 13695.78m 228.26h 9.51d 0.026 y # IO & Wait Time: 13643s 227.39m 3.79h 0.16d 0.000 y # Average job time: 830s 13.84m 0.23h 0.01d # Longest job: 935s 15.58m 0.26h 0.01d # Submission to last job: 5837s 97.28m 1.62h 0.07d # when they are finished, liftUp and load the .out files into the database: # next machine ssh eieio cd /cluster/data/ce2/RMRun foreach c (I II III IV V X M) liftUp chr${c}.fa.out /cluster/data/ce2/split/chr${c}.lft warn ${c}/*.fa.out end # next machine ssh hgwdev cd /cluster/data/ce2/RMRun hgLoadOut ce2 chr*.fa.out # Noticed one error in this load (reported to Robert Hubley): # Processing chrV.fa.out # Strange perc. 
# field -0.1 line 2196 of chrV.fa.out

# SIMPLE REPEAT [TRF] TRACK (DONE, hartera 2004-04-01)
# ensure chr*.fa files exist on /iscratch/i
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Celegans2/unmaskedFa
cp -p /cluster/data/ce2/sangerFa/*.fa \
    /iscratch/i/worms/Celegans2/unmaskedFa
iSync
# done iSync
# Create cluster parasol job:
# next machine
ssh kk
mkdir -p /cluster/data/ce2/bed/simpleRepeat
cd /cluster/data/ce2/bed/simpleRepeat
mkdir trf
ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > genome.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
echo "" > dummy.lst
gensub2 genome.lst dummy.lst gsub spec
para create spec
para try    # there are only 7, so this runs them all
para check
# para time
# Checking finished jobs
# Completed: 7 of 7 jobs
# CPU time in finished jobs:  3301s  55.01m  0.92h  0.04d  0.000 y
# IO & Wait Time:               38s   0.64m  0.01h  0.00d  0.000 y
# Average job time:            477s   7.95m  0.13h  0.01d
# Longest job:                 975s  16.25m  0.27h  0.01d
# Submission to last job:      975s  16.25m  0.27h  0.01d
# When cluster run is done, combine into one:
cat trf/*.bed > simpleRepeat.bed
# Load into the database:
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/simpleRepeat
hgLoadBed ce2 simpleRepeat simpleRepeat.bed \
    -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 28598 elements of size 16

# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-04-02 - hartera)
# After the simpleRepeat track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
# next machine
ssh eieio
cd /cluster/data/ce2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
    awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# create Soft and Hard masks from RepeatMasker and TRF outputs
# and rebuild the nib files using the soft masking in the fa:
# next machine
ssh eieio
cd /cluster/data/ce2
mkdir softMask
mkdir nib
cd /cluster/data/ce2
foreach c (I II III IV V X M)
    echo -n "masking chr${c} "
    maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
        softMask/chr${c}.fa -soft
    maskOutFa softMask/chr${c}.fa \
        bed/simpleRepeat/trfMask/chr${c}.bed \
        softMask/chr${c}.fa -softAdd
    faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
end
# output:
# masking chrI Writing 15080483 bases in 7540250 bytes
# masking chrII Writing 15279308 bases in 7639662 bytes
# masking chrIII Writing 13783313 bases in 6891665 bytes
# masking chrIV Writing 17493791 bases in 8746904 bytes
# masking chrV Writing 20922231 bases in 10461124 bytes
# masking chrX Writing 17718849 bases in 8859433 bytes
# masking chrM Writing 13794 bases in 6905 bytes
# create hard masks
mkdir hardMask
foreach c (I II III IV V X M)
    echo "masking chr${c}"
    /cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
        hardMask/chr${c}.fa
end
ssh kkr1u00
cd /cluster/data/ce2/softMask
mkdir -p /iscratch/i/worms/Celegans2/bothMasksFa
mkdir -p /iscratch/i/worms/Celegans2/nib
cp -p *.fa /iscratch/i/worms/Celegans2/bothMasksFa
cd /cluster/data/ce2/nib
cp -p c*.nib /iscratch/i/worms/Celegans2/nib
iSync

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-04-02 - hartera)
# Make symbolic links from /gbdb/ce2/nib to the real nibs.
# next machine
ssh hgwdev
mkdir -p /gbdb/ce2/nib
foreach f (/cluster/data/ce2/nib/*.nib)
    ln -s $f /gbdb/ce2/nib
end
cd /cluster/data/ce2/tmp
# Load /gbdb/ce2/nib paths into database and save size info
# hgNibSeq creates the chromInfo table
hgNibSeq -preMadeNib ce2 /gbdb/ce2/nib /cluster/data/ce2/sangerFa/chr*.fa
# Typical output:
# Processing /cluster/data/ce2/sangerFa/chrI.fa to /gbdb/ce2/nib/chrI.nib
# Processing /cluster/data/ce2/sangerFa/chrII.fa to /gbdb/ce2/nib/chrII.nib
# Processing /cluster/data/ce2/sangerFa/chrIII.fa to /gbdb/ce2/nib/chrIII.nib
# Processing /cluster/data/ce2/sangerFa/chrIV.fa to /gbdb/ce2/nib/chrIV.nib
# Processing /cluster/data/ce2/sangerFa/chrM.fa to /gbdb/ce2/nib/chrM.nib
# Processing /cluster/data/ce2/sangerFa/chrV.fa to /gbdb/ce2/nib/chrV.nib
# Processing /cluster/data/ce2/sangerFa/chrX.fa to /gbdb/ce2/nib/chrX.nib
# 100291769 total bases
# Verify the hgNibSeq load functioned OK:
hgsql -e "select chrom, size from chromInfo" ce2 > chrom.sizes
cat chrom.sizes
# chrom.sizes:
# chrom   size
# chrI    15080483
# chrII   15279308
# chrIII  13783313
# chrIV   17493791
# chrM    13794
# chrV    20922231
# chrX    17718849

# MAKE GAP TRACKS AND LOAD ASSEMBLY FRAGMENTS INTO DATABASE
# (DONE, 2004-04-05, hartera)
# next machine
ssh hgwdev
mkdir -p /cluster/data/ce2/bed/gap
cd /cluster/data/ce2/bed/gap
# findMotif finds motifs and reports the location of gaps as part of its output
foreach c (I II III IV V X M)
    findMotif -chr=chr${c} -verbose=4 -motif=gcatg /gbdb/ce2/nib >& chr${c}Bed.stderr
end
grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
# hgGoldGap does not handle dir names over 2 characters;
# directory III was moved to 3 when all this was created above
hgGoldGapGl -noGl ce2 /cluster/data/ce2 sangerFa
# All the gap tables are empty
# Load in gap.bed
# Need to add extra fields to the gap.bed file
cat << '_EOF_' > /cluster/data/ce2/jkStuff/createGapFile.pl
#!/usr/bin/perl -w
use strict;
my $oldchr = "";
while (<STDIN>) {
    my @fields = split(/\t/);
    my $chr = $fields[0];
    if ($chr ne $oldchr) {
        open(OUT, ">$chr"."_gap.bed");
        $oldchr = $chr;
    }
    print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
}
'_EOF_'
perl /cluster/data/ce2/jkStuff/createGapFile.pl < gap.bed
# load into relevant tables
foreach c (I II III IV V X M)
    hgLoadBed -tab -oldTable ce2 chr${c}_gap chr${c}_gap.bed
end

# CREATE gc5Base wiggle TRACK (DONE, 2004-04-05, hartera)
# Perform a gc count with a 5 base window.
# Also compute a "zoomed" view for display efficiency.
mkdir /cluster/data/ce2/bed/gc5Base
cd /cluster/data/ce2/bed/gc5Base
# In the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent.  The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100].  Two columns come out of the awk print statement,
# <position> and <value>, which are fed into wigAsciiToBinary through
# the pipe.  It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>.  The result files end up in ./wigData5.
cat << '_EOF_' > /cluster/data/ce2/jkStuff/runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in /cluster/data/ce2/nib/*.nib
do
    c=`basename ${n} | sed -e "s/.nib//"`
    C=`echo $c | sed -e "s/chr//"`
    echo -n "working on ${c} - ${C} ... 
" hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \ wigAsciiToBinary \ -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \ -name=${C} stdin 2> dataLimits5/${c} echo "done" done '_EOF_' chmod +x /cluster/data/ce2/jkStuff/runGcPercent.sh # This is going to take perhaps two hours to run. It is a lot of # data. make sure you do it on the fileserver: ssh eieio cd /cluster/data/ce2/bed/gc5Base /cluster/data/ce2/jkStuff/runGcPercent.sh # load the .wig files back on hgwdev: ssh hgwdev cd /cluster/data/ce2/bed/gc5Base hgLoadWiggle ce2 gc5Base wigData5/*.wig # and symlink the .wib files into /gbdb mkdir /gbdb/ce2/wib ln -s `pwd`/wigData5/*.wib /gbdb/ce2/wib # to speed up display for whole chromosome views, compute a "zoomed" # view and load that on top of the existing table. The savings # comes from the number of data table rows the browser needs to load # for a full chromosome view. Without the zoomed view there are # over 43,000 data rows for chrom 1. With the zoomed view there are # only 222 rows needed for the display. If your original data was # at 1 value per base the savings would be even greater. # Pretty much the same data calculation # situation as above, although this time note the use of the # 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average # together the data points coming out of the awk print statment over # a span of 1000 bases. Thus each coming out of wigZoom # will represent the measurement of GC in the next 1000 bases. Note # the use of -dataSpan=1000 on the wigAsciiToBinary to account for # this type of data. You want your dataSpan here to be an exact # multiple of your original dataSpan (5*200=1000) and on the order # of at least 1000, doesn't need to go too high. For data that is # originally at 1 base per value, a convenient span is: -dataSpan=1024 # A new set of result files ends up in ./wigData5_1K/*.wi[gb] cat << '_EOF_' > /cluster/data/ce2/jkStuff/runZoom.sh #!/bin/sh mkdir -p wigData5_1K mkdir -p dataLimits5_1K for n in /cluster/data/ce2/nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \ awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \ wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \ -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \ -name=${C} stdin 2> dataLimits5_1K/${c} echo "done" done '_EOF_' chmod +x /cluster/data/ce2/jkStuff/runZoom.sh # This is going to take even longer than above, certainly do this # on the fileserver ssh eieio cd /cluster/data/ce2/bed/gc5Base time /cluster/data/ce2/jkStuff/runZoom.sh # 520.000u 19.310s 6:15.09 143.7% 0+0k 0+0io 7875pf+0w # Then load these .wig files into the same database as above ssh hgwdev hgLoadWiggle -oldTable ce2 gc5Base wigData5_1K/*.wig # and symlink these .wib files into /gbdb mkdir /gbdb/ce2/wib ln -s `pwd`/wigData5_1K/*.wib /gbdb/ce2/wib # The browser needs to be fixed so it can display the assembly track # without the need for the gap tables to exist. 
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE2 (DONE, 2002-04-05, hartera) # next machine ssh hgwdev # Make trackDb table so browser knows what to expect: cd $HOME/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add ce2 in all the right places and do make update make alpha cvs commit -m "Added ce2" makefile # Add dbDb and defaultDb entries: echo 'insert into dbDb (name, description, nibPath, organism, \ defaultPos, active, orderKey, genome, scientificName, \ htmlPath, hgNearOk) \ values("ce2", "March 2004", \ "/gbdb/ce2/nib", "Worm", "chrII:14642289-14671631", 1, \ 60, "C. elegans", "Caenorhabditis elegans", \ "/gbdb/ce2/html/description.html", 0);' \ | hgsql -h genome-testdb hgcentraltest echo 'update defaultDb set name = "ce2" where genome = "C. elegans"' \ | hgsql -h genome-testdb hgcentraltest # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-04-05, hartera) ssh hgwdev mkdir /cluster/data/ce2/html cd /cluster/data/ce2/html # make a symbolic link from /gbdb/ce2/html to /cluster/data/ce2/html ln -s /cluster/data/ce2/html /gbdb/ce2/html # Write a description.hmtl - copy from /cluster/data/ce1/html/ # with a description of the assembly and some sample position queries. # create ce2 dir in /trackDb/worm and commit to CVS mkdir ~/kent/src/hg/makeDb/trackDb/worm/ce2 cvs add ce2 cvs commit ce2 # Add this also to ~/kent/src/hg/makeDb/trackDb/worm/ce2/description.html chmod a+r $HOME/kent/src/hg/makeDb/trackDb/worm/ce2/description.html # Check it in and copy (ideally using "make alpha" in trackDb) to # /gbdb/ce2/html cvs commit description.html # MAKE SANGER GENE (WORM BASE GENES) TRACK (DONE, 2004-04-09, hartera) # ADD MISSING ORFS AND PROTEIN NAMES TO sangerLinks TABLE AND # ADD proteinID COLUMN TO SANGERGENE TABLE FOR USE BY THE # FAMILY BROWSER (DONE, 2004-05-20, hartera) # next machine ssh eieio mkdir -p /cluster/data/ce2/bed/sangerGene cd /cluster/data/ce2/bed/sangerGene # the perl trims these files down to a reasonable size. As they are # they cause ldHgGene to hangup due to memory size. foreach f (I II III IV V X M) echo -n "${f} " zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \ -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \ -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \ -e "s/CHROMOSOME_M/chrM/g" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \ perl -ne '@a = split; \ print if($a[1] =~ /curated|DNA|RNA/i && \ $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff end echo # remove CDS before id e.g. from CDS "Y74C9A.2" and remove " perl -pi.bak -e 's/CDS "//' chr*.gff perl -pi.bak -e 's/"//' chr*.gff # check this worked ok then cleanup rm *.bak # check file sizes, should be reasonable ls -ogrt # total 23352 # -rw-rw-r-- 1 3430979 Apr 9 16:09 chrIII.gff # -rw-rw-r-- 1 3885105 Apr 9 16:09 chrII.gff # -rw-rw-r-- 1 3680753 Apr 9 16:09 chrI.gff # -rw-rw-r-- 1 4926390 Apr 9 16:09 chrV.gff # -rw-rw-r-- 1 3635 Apr 9 16:09 chrM.gff # -rw-rw-r-- 1 3854560 Apr 9 16:09 chrIV.gff # -rw-rw-r-- 1 4085489 Apr 9 16:09 chrX.gff # now load database with those transformed gff files # next machine ssh hgwdev cd /cluster/data/ce2/bed/sangerGene # 2004-05-10, hartera, Reload sangerGene table using extended GenePred # format. 2004-05-11, hartera. Extended format frame information does not # look correct. Reload without the extended fields. 
ldHgGene ce2 sangerGene *.gff
# Read 41259 transcripts in 423845 lines in 7 files
# 41259 groups 7 seqs 11 sources 6 feature types
# 23076 gene predictions
# if you need to delete that table:
echo 'drop table sangerGene' | hgsql ce2
# worm/ce2 trackDb.ra entry
# track sangerGene
# shortLabel WormBase Genes
# longLabel WormBase Gene Annotations
# group genes
# priority 48
# visibility pack
# color 0,0,200
# chromosomes chrI,chrII,chrIII,chrIV,chrV,chrX,chrM
# type genePred sangerPep
# url http://www.wormbase.org/db/gene/gene?name=$$
# hgGene on

# Add proteinID field to sangerGene table, used by the Gene Sorter
ssh hgwdev
cd /cluster/data/ce2/bed/sangerGene
hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce2
# To add an index on this column
hgsql -e 'alter table sangerGene add index(proteinID);' ce2
# Add Swiss-Prot protein IDs to this column.
# There are 23076 entries in sangerGene and 21780 of the names
# are in sangerLinks as orfName
hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \
    where g.name = l.orfName;' ce2
# 21780
# get names from sangerGene and sangerLinks tables
hgsql -N -e "select name from sangerGene;" ce2 | sort > name.sangerGene
hgsql -N -e "select orfName from sangerLinks;" ce2 | sort > orfName.sangerLinks
# get list of names in sangerGene not in sangerLinks
comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks
# Go to the WS120 WormBase mirror site and check SwissProt IDs:
# http://ws120.wormbase.org/db/searches/info_dump
# Some of these genes are non-coding RNAs or have no SwissProt ID, BUT
# others in the geneNames.notin.sangerLinks list do have a Swiss-Prot ID.
# Download IDs in batches of about 400 for gene names from
# geneNames.notin.sangerLinks and add them to the file SPIds.wormPep.WS120
# Create a perl script to parse out Swiss-Prot IDs for those genes that
# have them and create the sql statements to insert them into sangerLinks
cat << '_EOF_' > getSPIDsandLoad.pl
#!/usr/bin/perl -w
use strict;
my %SPhash;
my $found = "false";
my $gene = "";
my $SP = "";
open(SP, ">addSPIds.sql") || die "Can not create addSPIds.sql: $!";
while (<STDIN>) {
    chomp;
    my @fields = split(/\t/);
    if ($fields[0] =~ /^Gene:/) {
        $gene = $fields[1];
    }
    elsif (($fields[0] =~ /^GenPep:/) && ($gene ne "n/a") ){
        # if there are 2 fields then store second as Swiss-Prot ID
        if ($#fields == 1) {
            $SP = $fields[1];
        }
        # add gene and Swiss-Prot ID to hash
        $SPhash{$gene} = $SP;
        $gene = "";
        $SP = "";
    }
}
foreach my $g (keys(%SPhash) ) {
    my $s = $SPhash{$g};
    # print if there is a Swiss-Prot ID
    if ($s =~ /^[A-Z]{1}[0-9]+/) {
        print "Gene: $g\tSP ID: $s\n";
        # create mySQL to add this to the sangerLinks table
        print SP "INSERT into sangerLinks (orfName, protName, description) VALUES(\"$g\",\"$s\",\"\");\n"
    }
}
close SP;
'_EOF_'
# run perl script
perl getSPIDsandLoad.pl < SPIds.wormPep.WS120 > orfsandSPIds.out
# There are 111 of these genes with SwissProt IDs
# To insert new orfNames and SwissProt IDs into the table:
hgsql ce2 < addSPIds.sql
# Now 21891 names in sangerGene match orfNames in sangerLinks
# Use SwissProt IDs in the sangerLinks table to fill the proteinID
# column in the sangerGene table
hgsql -e 'update sangerGene as g, sangerLinks as l \
    set g.proteinID = l.protName where g.name = l.orfName;' ce2
# check there are 21891 rows with the protName field filled

# MYTOUCH FIX - Jen - 2006-01-24
sudo mytouch ce2 orfToGene 0505031600.00
sudo mytouch ce2 sangerBlastTab 0505031600.00
sudo mytouch ce2 sangerBlastTab 0505031600.00
sudo mytouch ce2 sangerCanonical
0505031600.00 sudo mytouch ce2 sangerIsoforms 0505031600.00 sudo mytouch ce2 sangerLinks 0505031600.00 sudo mytouch ce2 sangerPep 0505031600.00 sudo mytouch ce2 sangerToKim 0505031600.00 sudo mytouch ce2 sangerToPfam 0505031600.00 sudo mytouch ce2 sangerToRefSeq 0505031600.00 # MAKE WORMBASE GENEFINDER TRACKS (DONE, 2004-04-12, hartera) # next machine ssh eieio mkdir -p /cluster/data/ce2/bed/sangerGenefinder cd /cluster/data/ce2/bed/sangerGenefinder # the perl trims these files down to a reasonable size. As they are # they cause ldHgGene to hangup due to memory size. foreach f (I II III IV V X M) echo -n "${f} " zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \ -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \ -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \ -e "s/CHROMOSOME_M/chrM/g" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \ perl -ne '@a = split; \ print if($a[1] =~ /Genefinder/i && \ $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff end echo # remove CDS before id e.g. from CDS "Y74C9A.2" and remove " perl -pi.bak -e 's/CDS "//' chr*.gff perl -pi.bak -e 's/"//' chr*.gff # check this worked ok then cleanup rm *.bak # check file sizes, should be reasonable ls -ogrt # total 23712 # -rw-rw-r-- 1 3984558 Apr 12 09:14 chrII.gff # -rw-rw-r-- 1 3702836 Apr 12 09:14 chrI.gff # -rw-rw-r-- 1 0 Apr 12 09:14 chrM.gff # -rw-rw-r-- 1 3948579 Apr 12 09:14 chrIV.gff # -rw-rw-r-- 1 3364561 Apr 12 09:14 chrIII.gff # -rw-rw-r-- 1 5303266 Apr 12 09:14 chrV.gff # -rw-rw-r-- 1 3929597 Apr 12 09:14 chrX.gff # now load database with those transformed gff files # next machine ssh hgwdev cd /cluster/data/ce2/bed/sangerGenefinder # 2004-05-10, hartera, Reload sangerGene table using extended GenePred # format. 2004-05-11, hartera. Extended format frame information does not # look correct. Reload without the extended fields. ldHgGene ce2 sangerGenefinder *.gff # Read 36976 transcripts in 401688 lines in 7 files # 36976 groups 6 seqs 1 sources 4 feature types # 21180 gene predictions # if you need to delete that table: echo 'drop table sangerGenefinder' | hgsql ce2 # RUN Waba alignment with briggsae (WORKING - 2004-04-06 - Hiram) # prepare contigs from C. briggsae # Assumes C. briggsae data has been downloaded according to # makeCb1.doc # using briggsae contigs from previous work: /iscratch/i/worms/Cbriggsae/contigs # next machine ssh kk mkdir -p /cluster/data/ce2/bed/waba/out cd /cluster/data/ce2/bed/waba ls -1S /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > elegans.lst cat elegans.lst | while read FN do b=`basename ${FN}` mkdir out/${b%%.fa} done # create scripts to be used here cat << '_EOF_' > wabaRun #!/bin/csh -fe # # $1 - full pathname to a briggsae contig # $2 - file path to an elegans chrom.fa # $3 - result file full pathname # set f = $1:t set chr = $2:t set d = $chr:r mkdir -p /tmp/$d/$f cp $1 /tmp/$d/$f pushd . 
cd /tmp/$d/$f set t = $f:r /cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1 /cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2 /cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab cp $t.wab $3 popd rm -f /tmp/$d/$f/$t.* rmdir --ignore-fail-on-non-empty /tmp/$d/$f rmdir --ignore-fail-on-non-empty /tmp/$d '_EOF_' chmod +x wabaRun cat << '_EOF_' > jobTemplate #LOOP /cluster/store5/worm/ce2/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce2/bed/waba/out/$(root2)/$(root1).wab} #ENDLOOP '_EOF_' gensub2 briggsae.lst elegans.lst jobTemplate jobList para create jobList para try para check para push # one of the jobs takes quite a while. Most of the others are OK: Completed: 6950 of 6951 jobs Crashed: 1 jobs CPU time in finished jobs: 19284472s 321407.87m 5356.80h 223.20d 0.612 y IO & Wait Time: 63556s 1059.26m 17.65h 0.74d 0.002 y Average job time: 2784s 46.40m 0.77h 0.03d Longest job: 47017s 783.62m 13.06h 0.54d Submission to last job: 58555s 975.92m 16.27h 0.68d # The failed job is: # /cluster/store5/worm/ce2/bed/waba/wabaRun \ # /iscratch/i/worms/Cbriggsae/contigs/c0907.fa \ # /iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \ # /cluster/store5/worm/ce2/bed/waba/out/chrI/c0907.wab XXXX - running 2004-04-07 - retrying this stand-along on kkr1u00 # after about 1.5 hours fails with the error: # waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart && # c->qEnd == qEnd' failed. # Abort # Ignoring that error, and moving on # next machine ssh hgwdev cd /cluster/data/ce2/bed/waba mkdir Load cd Load # The cat through the pipe to hgWaba will avoid making large files # that are not needed. cat << '_EOF_' > loadEm.sh #!/bin/sh # for c in I II III IV V X M do echo -n "${c} " cat /cluster/store5/worm/ce2/bed/waba/out/chr${c}/c*.wab | /cluster/bin/i386/hgWaba ce2 Cbr chr${c} 0 stdin > proc${c}.out 2>&1 done exit 0 '_EOF_' chmod +x loadEm.sh # run it to load the waba track: ./loadEm.sh # remove garbage temp file: rm full_waba.tab chrom_waba.tab # worm/ce2/trackDb.ra entry: # track cbrWaba # shortLabel Briggsae Waba # longLabel C. briggsae Waba Alignments # group compGeno # priority 20 # visibility dense # color 140,0,200 # altColor 210,140,250 # MAKE 2BIT FILE FOR BLAT (DONE, 2004-04-07, hartera) # Make one big 2bit file as well, and make a link to it in # /gbdb/ce2/nib because hgBlat looks there: ssh eieio cd /cluster/data/ce2 faToTwoBit softMask/chr*.fa ce2.2bit ssh hgwdev ln -s /cluster/data/ce2/ce2.2bit /gbdb/ce2/nib/ # MAKE 11.OOC FILE FOR BLAT (DONE, 2004-04-07, hartera) # Use -repMatch=40 (based on size, for human use 1024) ssh kkr1u00 mkdir /cluster/bluearc/ce2 mkdir /cluster/data/ce2/bed/ooc cd /cluster/data/ce2/bed/ooc ls -1 /cluster/data/ce2/nib/chr*.nib > nib.lst /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \ -makeOoc=/cluster/bluearc/ce2/11.ooc -repMatch=40 # Writing /cluster/bluearc/ce2/11.ooc # Wrote 43676 overused 11-mers to /cluster/bluearc/ce2/11.ooc cp -p /cluster/bluearc/ce2/11.ooc /iscratch/i/worms/Celegans2/ iSync # AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-04-07, hartera) # next machine ssh hgwdev # Update genbank config and source in CVS: cd ~/kent/src/hg/makeDb/genbank cvsup . 
# See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in # edits, check them in if necessary: diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf # Edit etc/genbank.conf: default includes native genbank mRNAs and ESTs, # genbank xeno mRNAs but no ESTs, native RefSeq mRNAs but not xeno RefSeq # Add these lines: # ce2 (C. elegans) ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib ce2.lift = no ce2.downloadDir = ce2 cvs commit -m "Added ce2" etc/genbank.conf make # markd added -maxIntron=100000 for ce to genbank/src/align/gbBlat # Edit src/align/gbBlat to add /iscratch/i/worms/Celegans2/11.ooc cvs diff src/align/gbBlat make cvs commit -m "Added 11.ooc for ce2" src/align/gbBlat # Install to /cluster/data/genbank: make install-server # next machine ssh eieio cd /cluster/data/genbank # This is an -initial run, RefSeq mRNA only: nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial ce2 # Load results for RefSeq mRNAs: ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad ce2 # To start next run, results need to be moved out the way cd /cluster/data/genbank/work mv initial.ce2 initial.ce2.refseq.mrna # -initial for GenBank mRNAs ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial ce2 # Load results for GenBank mRNAs ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 ce2 cd /cluster/data/genbank/work mv initial.ce2 initial.ce2.genbank.mrna # -initial for ESTs ssh eieio cd /cluster/data/genbank nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial ce2 # Load results for GenBank ESTs ssh hgwdev cd /cluster/data/genbank nice bin/gbDbLoadStep -verbose=1 ce2 cd /cluster/data/genbank/work mv initial.ce2 initial.ce2.genbank.est # Everything loaded ok, so clean up rm -r /cluster/data/genbank/work/initial.ce2* # REMOVE XENO mRNAs (DONE, 2004-04-09, hartera) # Remove xeno mRNAs, too many are distantly related and do # not have meaningful alignments using Blat ssh hgwdev # Remove all mrna.xeno.* files from # /cluster/data/genbank/data/aligned/genbank.140.0/ce2 # edit /cluster/data/genbank/etc/genbank.conf cd ~/kent/src/hg/makeDb/genbank # add line to ce2: ce2.genbank.mrna.xeno.load = no # so now it is: # ce2 (C. elegans) ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib ce2.lift = no ce2.genbank.mrna.xeno.load = no ce2.downloadDir = ce2 make cvs update etc/genbank.conf cvs commit etc/genbank.conf make install-server cd /cluster/data/genbank/ # reload only the native mRNAs to the database nice bin/gbDbLoadStep -reload -type=mrna -srcDb=genbank -verbose=1 ce2 # drop xenoMrna table echo 'drop table xenoMrna' | hgsql ce2 # PRODUCING FUGU BLAT ALIGNMENTS (DONE, 2004-04-07, hartera) # Assumes masked NIBs have been prepared as above # and Fugu pieces are already on kluster /iscratch/i. # next machine ssh kk mkdir /cluster/data/ce2/bed/blatFr1 cd /cluster/data/ce2/bed/blatFr1 ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst ls -1S /iscratch/i/worms/Celegans2/nib/*.nib > worm.lst cat << '_EOF_' > gsub #LOOP blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy mkdir psl gensub2 worm.lst fugu.lst gsub spec para create spec para try, check, push, check, ... 
# para time
# Completed: 4046 of 4046 jobs
# CPU time in finished jobs:  263359s  4389.31m  73.16h  3.05d  0.008 y
# IO & Wait Time:              13317s   221.95m   3.70h  0.15d  0.000 y
# Average job time:               68s     1.14m   0.02h  0.00d
# Longest job:                  1103s    18.38m   0.31h  0.01d
# Submission to last job:       2170s    36.17m   0.60h  0.03d
# When cluster run is done, sort alignments
# next machine
ssh eieio
cd /cluster/data/ce2/bed/blatFr1
foreach d (I II III IV V X M)
    echo -n "${d} "
    pslCat psl/chr${d}_*.psl | \
        pslSortAcc nohead chrom temp stdin
    rm -f chrom/chr${d}_blatFr1.psl
    mv chrom/chr${d}.psl chrom/chr${d}_blatFr1.psl
end
# next machine
ssh hgwdev
# Load Fugu Blat alignments
cd /cluster/data/ce2/bed/blatFr1/chrom
cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 ce2 stdin
# Make fugu /gbdb/ symlink and load Fugu sequence data.
mkdir /gbdb/ce2/fuguSeq
ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/ce2/fuguSeq
cd /cluster/data/ce2/bed/blatFr1
hgLoadSeq ce2 /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
# Adding /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
# 20379 sequences
# worm/ce2 trackDb.ra entry
# track blatFr1
# shortLabel Fugu Blat
# longLabel Takifugu rubripes (fr1) Translated Blat Alignments
# group compGeno
# priority 110
# visibility dense
# color 0,60,120
# altColor 200,220,255
# spectrum on
# type psl xeno

# BLASTZ C. Briggsae (Cb1) (DONE, 2004-04-13, hartera)
# next machine
ssh kkr1u00
# blastz requires lineage-specific repeats but there are none for the worms
# so create empty files for each chromosome and iSync
mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# create chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV,
# chrV, chrX, chrM
mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
# create empty chrUn.out.spec file
iSync
ssh kk
mkdir -p /cluster/data/ce2/bed/blastz.cb1.2004-04-12
cd /cluster/data/ce2/bed
ln -s blastz.cb1.2004-04-12 blastz.cb1
cd blastz.cb1
cat << '_EOF_' > DEF
# C. elegans vs. C. briggsae
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1    # when SMSK=/dev/null
BLASTZ_ABRIDGE_REPEATS=0

# TARGET
SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/worms/Celegans2/rmsk
SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# FLAG not currently used
SEQ1_FLAG=-worm
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
# RMSK not currently used
SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
# FLAG not currently used
SEQ2_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
SEQ2_FLAG=-worm
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0

BASE=/cluster/data/ce2/bed/blastz.cb1

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.ce2-cb1.2004-04-12
# Need shell scripts from mm4 to do cluster runs
mv /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/ce2/jkStuff/
# prepare first cluster run
ssh kk
cd /cluster/data/ce2/bed/blastz.cb1
source DEF
/cluster/data/ce2/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# para time # Completed: 56 of 56 jobs # CPU time in finished jobs: 155723s 2595.39m 43.26h 1.80d 0.005 y # IO & Wait Time: 6387s 106.45m 1.77h 0.07d 0.000 y # Average job time: 2895s 48.25m 0.80h 0.03d # Longest job: 6137s 102.28m 1.70h 0.07d # Submission to last job: 11078s 184.63m 3.08h 0.13d # Second cluster run to convert the .out's to .lav's cd /cluster/data/ce2/bed/blastz.cb1 source DEF /cluster/data/ce2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 14 of 14 jobs # CPU time in finished jobs: 762s 12.70m 0.21h 0.01d 0.000 y # IO & Wait Time: 62s 1.03m 0.02h 0.00d 0.000 y # Average job time: 59s 0.98m 0.02h 0.00d # Longest job: 115s 1.92m 0.03h 0.00d # Submission to last job: 293s 4.88m 0.08h 0.00d # Third cluster run to convert lav's to axt's cd /cluster/data/ce2/bed/blastz.cb1 source DEF /cluster/data/ce2/jkStuff/BlastZ_run2.sh cd run.2 para try para check # only 7 jobs so all completed by para try # para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 187s 3.11m 0.05h 0.00d 0.000 y # IO & Wait Time: 169s 2.82m 0.05h 0.00d 0.000 y # Average job time: 51s 0.85m 0.01h 0.00d # Longest job: 77s 1.28m 0.02h 0.00d # Submission to last job: 77s 1.28m 0.02h 0.00d # translate sorted axt files into psl ssh eieio cd /cluster/data/ce2/bed/blastz.cb1 mkdir -p pslChrom set tbl = "blastzCb1" foreach f (axtChrom/chr*.axt) set c=$f:t:r echo "Processing chr $c" /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # Load database tables ssh hgwdev cd /cluster/data/ce2/bed/blastz.cb1/pslChrom foreach d (I II III IV V X M) /cluster/bin/i386/hgLoadPsl -noTNameIx ce2 chr${d}_blastzCb1.psl end # CHAIN Cb1 BLASTZ (DONE, 2004-04-30, hartera) # CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim ssh kk mkdir -p /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1 cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1 mkdir out chain ls -1S /cluster/data/ce2/bed/blastz.cb1/axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy cat << '_EOF_' > doChain #!/bin/csh axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \ /iscratch/i/worms/Celegans2/nib \ /iscratch/i/worms/Cbriggsae/bothMasksNib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList # only 7 jobs so all done by para try para try para check # para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 1265s 21.08m 0.35h 0.01d 0.000 y # IO & Wait Time: 69s 1.16m 0.02h 0.00d 0.000 y # Average job time: 191s 3.18m 0.05h 0.00d # Longest job: 632s 10.53m 0.18h 0.01d # Submission to last job: 632s 10.53m 0.18h 0.01d # now on the file server, sort chains ssh eieio cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29 time chainMergeSort run1/chain/*.chain > all.chain # User 37.040u # System 3.090s # Elapsed Real 0:42.54 time chainSplit chain all.chain # User 37.310u # System 2.930s # Elapsed Real 0:43.85 # Load chains into database # next machine ssh hgwdev cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain foreach i (*.chain) set c = $i:r hgLoadChain ce2 ${c}_chainCb1 $i echo done $c end # NET Cb1 (DONE, 2004-04-30, hartera) # REMAKE NET Cb1 WITH NEW CHAINS ssh eieio cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29 mkdir preNet mv /cluster/data/ce2/tmp/chrom.sizes 
/cluster/data/ce2/ # remove header line in file so chainPreNet will work cd chain foreach i (*.chain) echo preNetting $i /cluster/bin/i386/chainPreNet $i /cluster/data/ce2/chrom.sizes \ /cluster/data/cb1/chrom.sizes ../preNet/$i end cd .. mkdir n1 cd preNet foreach i (*.chain) set n = $i:r.net echo primary netting $i /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/ce2/chrom.sizes \ /cluster/data/cb1/chrom.sizes ../n1/$n /dev/null end cd .. cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net # memory usage 84246528, utime 710 s/100, stime 70 ssh hgwdev cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29 # add classification info using db tables # use -noAr option - don't look for ancient repeats time netClass -noAr hNoClass.net ce2 cb1 cb1.net # User 11.860u # System 2.090s # Elapsed Real 0:16.24 # Load the nets into database ssh hgwdev cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29 netFilter -minGap=10 cb1.net | hgLoadNet ce2 netCb1 stdin # MAKE axtNet TRACK (DONE, 2004-04-30, hartera) # TRACK REMADE WITH NEW CHAINS # Move back to the file server to create axt files corresponding to the net ssh kksilo cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29 # try without netFilter as not working well # netFilter -minGap=10 cb1.net > cb1Filt.net mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29 netSplit cb1.net cb1NetSplit cd cb1NetSplit foreach i (*.net) set c = $i:r netToAxt -maxGap=300 $i \ /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain/$c.chain \ /iscratch/i/worms/Celegans2/nib \ /iscratch/i/worms/Cbriggsae/bothMasksNib \ /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/$c.axt echo done ../axtNet.2004-04-29/$c.axt end cd .. rm -r cb1NetSplit # sort axt files based on target position mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29 foreach i (*.axt) set c = $i:r axtSort $c.axt ./sortedaxtNet/$c.axt echo done sorting $c.axt end # Load up the axtNet (alignment score wiggle) track: ssh hgwdev mkdir /gbdb/ce2/axtNetCb1 foreach f (/cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/chr*.axt) ln -s $f /gbdb/ce2/axtNetCb1 end cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet ${HOME}/bin/i386/hgLoadAxt ce2 axtNetCb1 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE2 (DONE, 2003-04-13, hartera) # next machine ssh hgwdev # DNA port is "0", trans prot port is "1" # 8/26/04: set canPcr=1 for untranslated blat server. echo 'insert into blatServers values("ce2", "blat1", 17781, 0, 1); \ insert into blatServers values("ce2", "blat1", 17780, 1, 0);' \ | hgsql -h genome-testdb hgcentraltest # if you need to delete those entries echo 'delete from blatServers where db="ce2";' \ | hgsql -h genome-testdb hgcentraltest # to check the entries: echo 'select * from blatServers where db="ce2";' \ | hgsql -h genome-testdb hgcentraltest # CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (DONE, 2004-04-27, hartera) mkdir /cluster/data/ce2/bed/orfToGene cd !$ # gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu # ORF e.g. Y110A7A.10 gene e.g. aap-1 awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt # reformat this for use in creating Sanger Links with hgWormLinks awk 'NF == 2' orfToGene.txt > orfToGene.txt2 # use perl script to move ORFs with the same gene name onto separate lines # and remove lines where there is a gene name with an alternate name # in parentheses. 
# this output file is used for Sanger Links
/cluster/data/ce2/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2
hgCeOrfToGene ce2 gene_names.txt sangerGene orfToGene

# DOWNLOAD WORMBASE FOR WS120 RELEASE (DONE, 2004-04-23, hartera)
# REMOVED acedb DIRECTORY AS NO LONGER NEEDED (DONE, 2008-08-18, hartera)
ssh eieio
mkdir -p /cluster/store7/acedb
cd /cluster/store7/acedb
wget -o ce2.fetchws120.log -r -l1 --timestamp --no-directories \
    ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/
gunzip *.gz
tar xvf *.tar
# 2008-08-18, hartera. Remove the acedb directory.
cd /cluster/store7
rm -rf acedb

# CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
# (DONE, 2004-05-03, hartera)
mkdir /cluster/data/ce2/bed/steinHelp
# Mail Lincoln Stein and ask for mappings from wormPep to
# SwissProt and from ORF to wormPep to description.  Place
# these files in the steinHelp directory:
# swissprot_dump.txt - contains ORF/WormPep/Description
# and contains SwissProt Ids and Accessions
# Jorge downloaded and installed the software for using AceDB
# to /cluster/bin/acedb/
# To use:
ssh hgwdev
cd /cluster/store7/acedb
# can use /cluster/bin/acedb/xace . to get the X-Window query interface
# To use the AceDB Query Language (AQL):
cd /cluster/store7/acedb
/cluster/bin/acedb/tace .
# At the prompt, can type ? for help; type the following queries.
# wbGeneClass.txt - associates the 3 letter first part of a worm gene
# name with a brief description.
acedb> AQL -o wbGeneClass.txt select l,l->Description from l in class Gene_Class
# wbConcise.txt - associates a worm gene name with a several
# sentence description.
acedb> AQL -o wbConcise.txt select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
# remove "" in wbGeneClass.txt and wbConcise.txt
sed -e 's/"//g;' wbGeneClass.txt > wbGeneClass.new
sed -e 's/"//g;' wbConcise.txt > wbConcise.new
mv wbGeneClass.new wbGeneClass.txt
mv wbConcise.new wbConcise.txt
# first remove header lines
sed -e 's/Format//' wbConcise.txt | sed -e 's/Locus[:]*//' > tmp
sed -e 's/Text//' tmp > tmp2
mv tmp2 wbConcise.txt
rm tmp
sed -e 's/Format//' wbGeneClass.txt | sed -e 's/Gene_class[:]*//' > tmp
sed -e 's/Text//' tmp > tmp2
mv tmp2 wbGeneClass.txt
rm tmp
# get SwissProt and TrEMBL Ids and accessions from the ftp site;
# TrEMBL Ids are missing from swissprot_dump.txt.
# Need these for details pages to work for the sangerGene track.
wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
    ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep.table120
# parse out CDS to Swiss-Prot ID mappings
cat << '_EOF_' > /cluster/data/ce2/jkStuff/getCDSandSPId.pl
#!/usr/bin/perl -w
use strict;
my $found = "false";
while (<STDIN>) {
    chomp;
    my @fields = split(/\t/);
    foreach my $f (@fields) {
        # get CDS name
        if ($f =~ /^>([0-9A-Za-z\.]+)/) {
            print "$1\t";
            $found = "false";
        }
        elsif ($f =~ /^(TR:|SP:)([0-9A-Z]{4,})$/) {
            print $2;
            $found = "true";
        }
    }
    if ($found eq "false") {
        print "NULL";
    }
    print "\n";
}
'_EOF_'
chmod a+x /cluster/data/ce2/jkStuff/getCDSandSPId.pl
perl /cluster/data/ce2/jkStuff/getCDSandSPId.pl < wormpep.table120 > spCdsToId.txt
# rewrote hgWormLinks to extract information from these files
hgWormLinks spCdsToId.txt swissprot_dump.txt \
    wbConcise.txt wbGeneClass.txt \
    /cluster/data/ce2/bed/orfToGene/orfToGene.tab2 sangerLinks.txt
# create table in ce2 database and load data
hgsql ce2 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
hgsql -e \
    'load data local infile "sangerLinks.txt" into table sangerLinks' \
    ce2

# CREATE SANGERPEP TABLE (DONE, 2004-04-29,
hartera) # RECREATED SANGERPEP TABLE SO AS all.joiner EXPECTS THE CREATE TIME TO # BE LATER THAN THAT FOR THE SANGERGENE TABLE (DONE, 2004-05-21, hartera) mkdir -p /cluster/data/ce2/bed/sangerPep cd /cluster/data/ce2/bed/sangerPep # Download peptide sequences from the Sanger Centre ftp site: wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \ ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep120 # Load into database # 2004-05-10 and 2004-05-11 hartera dropped sangerPep table and recreated it # all.joiner expects sangerGene table to have an earlier # load time than sangerPep so recreate sangerPep after sangerGene hgPepPred ce2 generic sangerPep wormpep120 # the sangerPep table is used by the sangerGene track # TWINSCAN GENE PREDICTIONS (DONE, 2004-04-16, hartera) # These are Iscan (new version of Twinscan) gene predictions # e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data ssh hgwdev mkdir /cluster/data/ce2/bed/twinscan cd /cluster/data/ce2/bed/twinscan wget --timestamping -r \ http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/ mv ./genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/* \ /cluster/data/ce2/bed/twinscan/ rm -r genes.cs.wustl.edu rm index* ./chr_gtf/index* ./chr_ptx/index* ./chr_tx/index* # Clean up chrom field of GTF files foreach c (I II III IV V X) set f = chr_gtf/chr_${c}.iscan_pred_gtf echo ${f} sed -e "s/CHROMOSOME_/chr/g" ${f} | \ sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \ chr_gtf/chr${c}-fixed.gtf end # pare down protein FASTA header to id and add missing .1: foreach c (I II III IV V X) set f = chr_ptx/CHROMOSOME_${c}.ptx echo ${f} perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \ ${f} > chr_ptx/chr${c}-fixed.fa end # load data into database, need -gtf option as no stop codons in # these predictions. also use new -frame -id and -geneName options ldHgGene -gtf -frame -id -geneName ce2 twinscan chr_gtf/chr*-fixed.gtf # Read 20990 transcripts in 147535 lines in 6 files # 20990 groups 6 seqs 1 sources 3 feature types # 20990 gene predictions # 2004-05-10, hartera dropped twinscanPep table and re-created it. # twinscan table was reloaded using extended options after loading # twinscanPep. all.joiner expects twinscan table to have an earlier # load time than twinscanPep hgPepPred ce2 generic twinscanPep chr_ptx/chr*-fixed.fa # MAKING BLASTZ SELF (DONE, 2004-04-20, hartera) # The procedure for lineage spec business with self is to simply # use the actual repeat masker output for this C. elegans assembly as # the lineage specific repeats for itself. Thus, merely make # symlinks to the repeat masker out files and name them as expected # for blastz. In this case they are called notinCelegans but they # really mean inCelegans. Yes, it is confusing, but that's just the # nature of the game in this case. # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans foreach f (/iscratch/i/worms/Celegans2/bothMasksFa/*.fa) set base = $f:t:r:r echo $base.out.spec ln -s $f $base.out.spec end iSync # next machine ssh hgwdev mkdir -p /cluster/data/ce2/bed/blastzSelf cd /cluster/data/ce2/bed/blastzSelf cat << '_EOF_' > DEF # C. elegans vs. C. elegans export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386 ALIGN=blastz-run BLASTZ=blastz BLASTZ_H=2000 BLASTZ_ABRIDGE_REPEATS=1 # TARGET # C. 
elegans SEQ1_DIR=/iscratch/i/worms/Celegans2/nib # not used SEQ1_RMSK= # not used SEQ1_FLAG= SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans SEQ1_IN_CONTIGS=0 SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY # C. elegans SEQ2_DIR=/iscratch/i/worms/Celegans2/nib # not currently used SEQ2_RMSK= # not currently used SEQ2_FLAG= SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans SEQ2_IN_CONTIGS=0 SEQ2_CHUNK=10000000 SEQ2_LAP=10000 BASE=/cluster/data/ce2/bed/blastzSelf DEF=$BASE/DEF RAW=$BASE/raw CDBDIR=$BASE SEQ1_LEN=$BASE/S1.len SEQ2_LEN=$BASE/S2.len '_EOF_' # << this line makes emacs coloring happy # Save the DEF file in the current standard place cp DEF ~angie/hummus/DEF.ce2-ce2.2004-04-14 ssh kk cd /cluster/data/ce2/bed/blastzSelf # source the DEF file to establish environment for following commands source DEF /cluster/data/mm4/jkStuff/BlastZ_run0.sh cd run.0 para try, check, push, check, .... # para time # Completed: 196 of 196 jobs # CPU time in finished jobs: 418569s 6976.16m 116.27h 4.84d 0.013 y # IO & Wait Time: 2416s 40.26m 0.67h 0.03d 0.000 y # Average job time: 2148s 35.80m 0.60h 0.02d # Longest job: 17917s 298.62m 4.98h 0.21d # Submission to last job: 53181s 886.35m 14.77h 0.62d cd /cluster/data/ce2/bed/blastzSelf source DEF /cluster/data/ce2/jkStuff/BlastZ_run1.sh cd run.1 para try, check, push, etc ... # para time # Completed: 14 of 14 jobs # CPU time in finished jobs: 3340s 55.67m 0.93h 0.04d 0.000 y # IO & Wait Time: 98s 1.63m 0.03h 0.00d 0.000 y # Average job time: 246s 4.09m 0.07h 0.00d # Longest job: 439s 7.32m 0.12h 0.01d # Submission to last job: 892s 14.87m 0.25h 0.01d # Third cluster run to convert lav's to axt's # lavToAxt has a new -dropSelf option that replaces axtDropOverlap # to remove the alignment blocks on the diagonal # When this was first run, it crashed for chrV with the # error: breakUpIfOnDiagonal: Too many fragmented block lists! # This means that when a blockList is taken and broken up everywhere # there is a block along the diagonal and added to an array of # fragmented sub-lists then in this chrosomosome the list has exceeded # the array size of 256 for an alignment. Change the line: # struct block *bArr[256] to struct block *bArr[1024] in the # parseIntoAxt function, make and re-run cd /cluster/data/ce2/bed/blastzSelf mkdir axtChrom mkdir run.2 cd run.2 cat << '_EOF_' > do.csh #!/bin/csh cd $1 cat `ls -1 *.lav | sort -g` \ | ~/bin/i386/lavToAxt -dropSelf stdin /iscratch/i/worms/Celegans2/nib \ /iscratch/i/worms/Celegans2/nib stdout \ | axtSort stdin $2 '_EOF_' # << this line makes emacs coloring happy chmod a+x do.csh cat << '_EOF_' > gsub #LOOP ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/ce2/bed/blastzSelf/axtChrom/$(root1).axt} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy ls -1Sd ../lav/chr* > chrom.list gensub2 chrom.list single gsub jobList # check jobList wc -l jobList head jobList para create jobList para try, check .... # only 7 jobs so all completed by para try para try, para check ...etc. # para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 796s 13.27m 0.22h 0.01d 0.000 y # IO & Wait Time: 551s 9.18m 0.15h 0.01d 0.000 y # Average job time: 192s 3.21m 0.05h 0.00d # Longest job: 302s 5.03m 0.08h 0.00d # Submission to last job: 303s 5.05m 0.08h 0.00d # Use Blastz alignments to create Self Chain. Do not create # Self Blastz track since this is redundant with the Self Chain. 
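# Hedged sanity check (not part of the original run): after lavToAxt -dropSelf,
# none of the remaining axt blocks should map a region onto exactly itself.
# This awk scan of the 9-field axt summary lines is illustrative only; the
# count should come back 0 for every chromosome.
foreach f (/cluster/data/ce2/bed/blastzSelf/axtChrom/chr*.axt)
    echo -n "$f "
    awk 'NF == 9 && $2 == $5 && $3 == $6 && $4 == $7 && $8 == "+" {n++} END {print n+0, "self-identical blocks"}' $f
end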
# CHAIN SELF BLASTZ (DONE, 2004-05-03, hartera) # CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim # The axtChain is best run on the small kluster, or the kki kluster # in this case, it was run on the kk kluster ssh kk mkdir -p /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1 cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1 mkdir out chain ls -1S /cluster/data/ce2/bed/blastzSelf/axtChrom/*.axt \ > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # << this line makes emacs coloring happy # Problems running this. At first, all jobs crashed and para problems said that # the ./out/*.out files were empty. Then copied nib files also to a nib2 # directory and change in doChain so the 2 nib arguments are not the same. # Then crashed saying that the ./chain/*.chain files were empty and that the # files in nib2 could not be opened (permissions ok). changed doChain back to # having the two nib directory arguments the same. Removed and recreated jobList# but left ./chain/*.chain and ./out/*.out # This time the jobs ran ok. Also runs ok if run axtFilter followed by # axtChain separately on the command line. cat << '_EOF_' > doChain #!/bin/csh axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \ /iscratch/i/worms/Celegans2/nib /iscratch/i/worms/Celegans2/nib $2 >& $3 '_EOF_' # << this line makes emacs coloring happy chmod a+x doChain gensub2 input.lst single gsub jobList para create jobList # 7 jobs so all completed by para try para try, check,....etc. # after remaking the axt files using the -dropSelf option for lavToAxt, # these jobs no longer crashed. # para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 2700s 45.00m 0.75h 0.03d 0.000 y # IO & Wait Time: 620s 10.34m 0.17h 0.01d 0.000 y # Average job time: 474s 7.90m 0.13h 0.01d # Longest job: 844s 14.07m 0.23h 0.01d # Submission to last job: 844s 14.07m 0.23h 0.01d # now on the file server, sort chains ssh eieio cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30 chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # take a look at score distr's foreach f (chain/*.chain) grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r echo $f:t:r textHistogram -binSize=10000 /tmp/score.$f:t:r echo "" end # for chrI # chrI # large values truncated: need 1414 bins or larger binSize than 10000 # 0 ************************************************************ 710660 # 10000 ******** 90988 # 20000 ** 25092 # 30000 * 10405 # 40000 5912 # 50000 2958 # 60000 2157 # 70000 1780 # 80000 1336 # 90000 900 # 100000 636 # 110000 579 # 120000 451 # 130000 403 # 140000 351 # 150000 337 # 160000 287 # 170000 210 # 180000 88 # 190000 129 # 200000 147 # 210000 80 # 220000 60 # 230000 73 # 240000 42 # trim to minScore=20000 to cut some of the fluff mkdir chainFilt foreach f (chain/*.chain) chainFilter -minScore=20000 $f > chainFilt/$f:t end # Load chains into database # next machine ssh hgwdev cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/chainFilt foreach i (*.chain) set c = $i:r echo loading $c hgLoadChain ce2 ${c}_chainSelf $i echo done $c end # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-04-21, hartera) ssh kksilo cd /cluster/data/ce2 #- Build the .zip files cat << '_EOF_' > jkStuff/zipAll.csh rm -rf zip mkdir zip # chrom AGP's zip -j zip/chromAgp.zip ./sangerFa/[0-9A-Z]*/chr*.agp # chrom RepeatMasker out files zip -j zip/chromOut.zip ./RMRun/chr*.fa.out 
# soft masked chrom fasta
zip -j zip/chromFa.zip ./softMask/chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip hardMask/chr*.fa
# chrom TRF output files
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMask/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=ce2 -native GenBank mrna \
    /cluster/data/ce2/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=ce2 -xeno GenBank mrna \
    /cluster/data/ce2/zip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=ce2 -native refseq mrna \
    /cluster/data/ce2/zip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=ce2 -native GenBank est \
    /cluster/data/ce2/zip/est.fa
cd /cluster/data/ce2/zip
# zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j refMrna.zip refMrna.fa
zip -j est.zip est.fa
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipAll.csh |& tee zipAll.log
cd zip
#- Look at zipAll.log to make sure all the file lists look reasonable.
# Make upstream files and copy the .zip files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/ce2/zip
# make upstream files for C. elegans RefSeq
featureBits ce2 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
featureBits ce2 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
featureBits ce2 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
#- Check zip file integrity:
foreach f (*.zip)
    unzip -t $f > $f.test
    tail -1 $f.test
end
wc -l *.zip.test
set gp = /usr/local/apache/htdocs/goldenPath/ce2
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
mkdir -p $gp/chromosomes
foreach f ( ../sangerFa/chr*.fa )
    zip -j $gp/chromosomes/$f:t.zip $f
end
cd $gp/bigZips
md5sum *.zip > md5sum.txt
cd $gp/chromosomes
md5sum *.zip > md5sum.txt
# Take a look at bigZips/* and chromosomes/*, update their README.txt's

# MAKE VSCB1 DOWNLOADS (DONE, 2004-05-11, hartera)
# DO THIS LATER
ssh kksilo
cd /cluster/data/ce2/bed/blastz.cb1
zip -j /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom/chr*.axt
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
cp all.chain cb1.chain
zip -j /cluster/data/ce2/zip/cb1.chain.zip cb1.chain
rm cb1.chain
zip -j /cluster/data/ce2/zip/cb1.net.zip cb1.net
# add axtNets
cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
foreach f (I II III IV V X M)
    cp chr${f}.axt chr${f}.axtNet.axt
end
zip -j /cluster/data/ce2/zip/axtNetCb1.zip chr*.axtNet.axt
rm *.axtNet.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
cd /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
mv /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom.zip
mv /cluster/data/ce2/zip/cb1*.zip .
mv /cluster/data/ce2/zip/axtNetCb1.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE VSSELF DOWNLOADS (DONE, 2004-05-11, hartera)
ssh kksilo
cd /cluster/data/ce2/bed/blastzSelf
zip -j /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom/chr*.axt
cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
cp all.chain self.chain
zip -j /cluster/data/ce2/zip/self.chain.zip self.chain
rm self.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
cd /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
mv /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom.zip
mv /cluster/data/ce2/zip/self.chain.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain formats.
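# (Not part of the original build log.)  Once the md5sum.txt files are in
# place, the downloads can be spot-checked from within each download
# directory using md5sum's check mode, e.g.:
cd /usr/local/apache/htdocs/goldenPath/ce2/bigZips
md5sum -c md5sum.txt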
# miRNA track (DONE - 2004-05-04 - Hiram)
# data from: Sam Griffiths-Jones
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates, to renew this track
ssh hgwdev
mkdir /cluster/data/ce2/bed/miRNA
cd /cluster/data/ce2/bed/miRNA
wget --timestamping \
    "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/cel_ws120.*"
grep -v "^track " cel_ws120.bed | sed -e "s/ /\t/g" > ce2.bed
hgLoadBed ce2 miRNA ce2.bed
# entry in trackDb/trackDb.ra is already there
# Compare with ce1:
# featureBits ce1 miRNA
#     10329 bases of 100277784 (0.010%) in intersection
# featureBits ce2 miRNA
#     11382 bases of 100291761 (0.011%) in intersection

# MAKE ALT. SPLICING TRACK (DONE, 2004-05-13, hartera)
# create rnaCluster table and Gene Bounds track
ssh hgwdev
mkdir /cluster/data/ce2/bed/rnaCluster
cd /cluster/data/ce2/bed/rnaCluster
foreach f (I II III IV V X)
    echo "clusterRna ce2 /dev/null chr${f}.bed -chrom=chr${f}"
    clusterRna ce2 /dev/null chr${f}.bed -chrom=chr${f}
end
hgLoadBed ce2 rnaCluster *.bed
hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
# generate a per-chromosome dump script; "=" is used as a stand-in for the
# single quote and is converted back by the trailing sed (an example of one
# generated line is shown just after the altSplice run below)
(perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce2 | egrep -v chromStart | cut -f2\-13 > $_.ce2.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh
chmod a+x rnaCluster.sh
rnaCluster.sh
mkdir -p /cluster/data/ce2/bed/altSplice/agx
cd /cluster/data/ce2/bed/altSplice
hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce2 -beds=../rnaCluster/$_.ce2.rnaCluster.bed -agxOut=agx/$_.ce2.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
# these jobs are no big deal on the worm, just run this file:
chmod +x altSplice.para.spec
mkdir agx
./altSplice.para.spec
# 0 genePredictions with 2610 clusters, 0 cassette exons, 0 of are not mod 3.
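# (Not part of the original log.)  A quick way to confirm the generated
# rnaCluster.sh from the step above looks right; assuming chrI is the first
# chromosome listed in chrom.tab, the first two lines are expected to be
# (second line wrapped here only for readability):
head -2 ../rnaCluster/rnaCluster.sh
#   #!/bin/sh
#   hgsql -A -e 'select * from rnaCluster where chrom like "chrI" order by chromStart, name' ce2 \
#       | egrep -v chromStart | cut -f2-13 > chrI.ce2.rnaCluster.bed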
cat agx/*.agx > altSplice.agx
# UP TO HERE, make agxToBed and use local copy
~/bin/i386/agxToBed altSplice.agx altSplice.bed
cat << '_EOF_' >> altGraphX.sql
CREATE TABLE altGraphX (
    bin smallint unsigned not null,
    tName varchar(255) NOT NULL default '',
    tStart int(11) NOT NULL default '0',
    tEnd int(11) NOT NULL default '0',
    name varchar(255) NOT NULL default '',
    id int(10) unsigned NOT NULL auto_increment,
    strand char(2) NOT NULL default '',
    vertexCount int(10) unsigned NOT NULL default '0',
    vTypes longblob NOT NULL,
    vPositions longblob NOT NULL,
    edgeCount int(10) unsigned NOT NULL default '0',
    edgeStarts longblob NOT NULL,
    edgeEnds longblob NOT NULL,
    evidence longblob NOT NULL,
    edgeTypes longblob NOT NULL,
    mrnaRefCount int(11) NOT NULL default '0',
    mrnaRefs longblob NOT NULL,
    mrnaTissues longblob NOT NULL,
    mrnaLibs longblob NOT NULL,
    PRIMARY KEY (id),
    KEY tName (tName(16),tStart,tEnd)
);
'_EOF_'
hgLoadBed -sqlTable=altGraphX.sql ce2 altGraphX altSplice.agx
# trackDb entry:
    track altGraphX
    shortLabel Alt-Splice
    longLabel Alternative Splicing
    group rna
    priority 74.1
    visibility pack
    type bed 3
    canPack off

##################################################################
# Create frames-based page views of alt-splice areas of interest
# (DONE - 2004-05-16 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce2/bed/altSplice/altAnalysis
cd /cluster/data/ce2/bed/altSplice/altAnalysis
altAnalysis ce2 ../altSplice.agx altSummary.txt links.html \
    frame.html -bedName=all
# creates these files (as well as several .bed files):
# -rw-rw-r--  1    282 May 16 09:19 frame.html
# -rw-rw-r--  1 391029 May 16 09:19 links.html
# Copy these to the Intronerator WS120 location, change the
# default opening location:
cp -p links.html /usr/local/apache/htdocs/IntronWS120
sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
    -e "s/NM_005487/chrII:14689493-14690232/" \
    -e "s/hgwdev-sugnet/genome-test/" \
    frame.html > /usr/local/apache/htdocs/IntronWS120/frame.html
# Better location is in the goldenPath itself 2006-01-12:
mkdir /usr/local/apache/htdocs/goldenPath/ce2/altGraphX
sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
    > /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/links.html
sed -e \
    "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
    -e "s/NM_005487/chrII:14689493-14690232/" \
    -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
    frame.html \
    > /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/frame.html

# MAKE single coverage BEST track from blastz axt's (DONE - 2004-05-11 - Hiram)
# This data is for use with the phyloHMM construction, below.
ssh eieio
cd /cluster/data/ce2
cat << '_EOF_' > jkStuff/mkBest.sh
#!/bin/sh
mkdir axtBest
mkdir pslBest
for c in I II III IV V X M
do
    echo "axtBesting chr$c"
    /cluster/bin/i386/axtBest axtChrom/chr${c}.axt chr${c} \
        axtBest/chr${c}.axt -minScore=300
    echo "processing chr${c}.axt -> chr${c}_blastzBestCb1.psl"
    /cluster/bin/i386/axtToPsl axtBest/chr${c}.axt \
        S1.len S2.len pslBest/chr${c}_blastzBestCb1.psl
    echo "Done: chr${c}_blastzBestCb1.psl"
done
'_EOF_'
chmod +x jkStuff/mkBest.sh
cd /cluster/data/ce2/bed/blastz.cb1
time ../../jkStuff/mkBest.sh
# real    1m47.069s
# user    1m1.880s
# sys     0m14.240s

# Load this table
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/pslBest
time /cluster/bin/i386/hgLoadPsl ce2 -table=blastzBestCb1 \
    chr*_blastzBestCb1.psl
# real    0m21.775s
# user    0m8.510s
# sys     0m1.500s

# MAKE phyloHMM data
# (redone using new phastCons, acs 9/14/04)
# (redone using nets rather than axtBest, acs 10/10/04)
ssh hgwdev
mkdir /cluster/data/ce2/bed/pHMM
cd /cluster/data/ce2/bed/pHMM
mkdir axt
cd axt
ln -s /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/*.axt .
cd ..
mkdir maf
# create maf files from the axt files
for c in I II III IV M V X
do
    axtToMaf axt/chr${c}.axt tSizes qSizes maf/chr${c}.Tmaf
    sed -e "s/chrUn/cb1.chrUn/" maf/chr${c}.Tmaf | sed -e "s/ chr/ ce2.chr/" > \
        maf/chr${c}.maf
    rm -f maf/chr${c}.Tmaf
    echo "done chr${c}"
done

# split up alignments; no need to use the cluster for the worm
mkdir -p /cluster/bluearc/ce2/phastCons/WINDOWS
mkdir -p /scratch/phastCons.ce2.09-14-04.WINDOWS
for i in I II III IV V X M; do \
    msa_split maf/chr${i}.maf -i MAF -M /cluster/data/ce2/sangerFa/chr${i}.fa \
        -w 1000000,0 -r /scratch/phastCons.ce2.09-14-04.WINDOWS/chr${i} -o SS \
        -I 1000 -B 5000 ;\
done
for file in /scratch/phastCons.ce2.09-14-04.WINDOWS/* ; do \
    echo $file ;\
    gzip -c $file > /cluster/bluearc/ce2/phastCons/WINDOWS/`basename $file`.gz ;\
done
rm -rf /scratch/phastCons.ce2.09-14-04.WINDOWS

# produce rough starting model
msa_view -i MAF --aggregate ce2,cb1 -o SS -z maf/chr*.maf > all.ss
phyloFit -i SS all.ss -o starting-tree
# also get average GC content of aligned regions
msa_view -i SS all.ss --summary-only
# descrip.   A      C      G      T      G+C    length   all_gaps some_gaps
# all.ss   0.3130 0.1872 0.1869 0.3129 0.3741 58909311        0  11236473

# now set up a cluster job to estimate model parameters.  See
# makeHg17.doc for add'l details
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3741 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 25 --target-coverage 0.45 --quiet --log $2 --estimate-trees $3
EOF
chmod u+x doEstimate.sh
# (--target-coverage of 0.45 is based on a genome-wide target of 0.2 and
# about 45% alignment coverage: 0.2/0.45 is approx. 0.45)
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do
    root=`basename $f .ss.gz` ;\
    echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done
# run cluster job
ssh kk...
para create ...
# (takes about 2 minutes)
# Now combine parameter estimates.
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt

# Now set up a cluster job for computing conservation scores and predicted elements
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 25 --target-coverage 0.45 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/ce2/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/ce2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
rm -f jobs2.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
# run cluster job
ssh kk...
para create, ...
# (takes about 30 sec)

# set up phastConsElements track
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' /cluster/bluearc/ce2/phastCons/ELEMENTS/*.bed > all.raw.bed
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
hgLoadBed ce2 phastConsElements all.bed
featureBits ce2 phastConsElements
# 20217282 bases of 100291761 (20.158%) in intersection

# set up wiggle
mkdir wib
for i in I II III IV V X M; do \
    zcat `ls /cluster/bluearc/ce2/phastCons/POSTPROBS/chr${i}.*.pp.gz | sort -t\. -k2,2n` | \
    wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons stdin ;\
done
hgLoadWiggle ce2 phastCons -pathPrefix=/gbdb/ce2/phastCons/wib wib/chr*_phastCons.wig
mkdir -p /gbdb/ce2/phastCons/wib
rm -f /gbdb/ce2/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/ce2/bed/pHMM/wib/*.wib /gbdb/ce2/phastCons/wib
chmod 775 . wib /gbdb/ce2/phastCons /gbdb/ce2/phastCons/wib
chmod 664 wib/*.wib

# move postprobs over and clean up bluearc
rsync -av /cluster/bluearc/ce2/phastCons/POSTPROBS .
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/ce2/phastCons/ELEMENTS /cluster/bluearc/ce2/phastCons/POSTPROBS

# set up full alignment/conservation track
# note that in this case the pairwise mafs are used both for the
# "multiple" alignment and the pairwise alignments
mkdir -p /gbdb/ce2/c_briggsae_pwMaf
cd maf
ln -s `pwd`/*.maf /gbdb/ce2/c_briggsae_pwMaf
hgLoadMaf ce2 -warn c_briggsae_pwMaf -pathPrefix=/gbdb/ce2/c_briggsae_pwMaf
chmod 775 /gbdb/ce2/c_briggsae_pwMaf .
chmod 664 *.maf
# trackDb entry:
#    track c_briggsae_pwMaf
#    shortLabel Conservation
#    longLabel Blastz Alignments with C. Briggsae & Conservation
#    group compGeno
#    priority 100
#    visibility pack
#    type wigMaf 0.0 1.0
#    maxHeightPixels 100:40:11
#    wiggle phastCons
#    yLineOnOff Off
#    autoScale Off
#    pairwise pwMaf
#    speciesOrder c_briggsae

######## MAKING GENE SORTER TABLES #### (DONE - 2004-05-27 - hartera)
# These are instructions for building the neighborhood browser.
# Don't start these until there is a sangerGene table with a proteinID
# column containing Swiss-Prot protein IDs, and until the Kim lab
# expression data is loaded (in hgFixed).
# Cluster together various alt-splicing isoforms.
# Creates the sangerIsoforms and sangerCanonical tables
# (DONE, 2004-05-21, hartera)
ssh hgwdev
hgClusterGenes ce2 sangerGene sangerIsoforms sangerCanonical
# You may need to build this binary in src/hg/near/hgClusterGenes
# Got 20713 clusters, from 23076 genes in 7 chromosomes
# featureBits ce2 sangerCanonical
#     54156601 bases of 100291761 (53.999%) in intersection
# featureBits ce1 sangerCanonical
#     53654286 bases of 100277784 (53.506%) in intersection

# Create Self blastp table - sangerBlastTab (DONE, 2004-05-24, hartera)
# Reload blastp data into sangerBlastTab and drop knownBlastTab -
# the table was given the wrong name (DONE, 2004-05-28, hartera)
# Get the sangerPep peptide sequences fasta file from the sangerPep dir
# and create a blast database out of them.
mkdir -p /cluster/data/ce2/bed/blastp
cd /cluster/data/ce2/bed/blastp
mv /cluster/data/ce2/bed/sangerPep/wormPep120 wormPep120.faa
formatdb -i wormPep120.faa -t wormPep120 -n wormPep120
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy database over to iscratch
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastp
cp /cluster/data/ce2/bed/blastp/wormPep120.* /iscratch/i/ce2/blastp
iSync
# Blastall and the Data directory are already in /iscratch/i/blast/
# From the dates, this looks to be the same version as in
# /projects/compbio/bin/i686/

# Split up the fasta file into bite-sized chunks for the cluster
cd /cluster/data/ce2/bed/blastp
mkdir split
faSplit sequence wormPep120.faa 8000 split/wp

# Make parasol run directory
ssh kk
mkdir -p /cluster/data/ce2/bed/blastp/self
cd /cluster/data/ce2/bed/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep120 \
    -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too long an argument list, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a minute, and do a para check; if all is good
# then do a para push, para check, etc.
# para time
# Completed: 6684 of 6684 jobs
# CPU time in finished jobs:     35657s     594.28m     9.90h    0.41d  0.001 y
# IO & Wait Time:                27262s     454.37m     7.57h    0.32d  0.001 y
# Average job time:                  9s       0.16m     0.00h    0.00d
# Longest job:                     193s       3.22m     0.05h    0.00d
# Submission to last job:          302s       5.03m     0.08h    0.00d

# Load into database.
ssh hgwdev
cd /cluster/data/ce2/bed/blastp/self/run/out
hgLoadBlastTab ce2 sangerBlastTab *.tab

# CREATE EXPRESSION DISTANCE TABLES FOR
# KIM LAB EXPRESSION DATA (DONE, 2004-05-24, hartera)
hgExpDistance ce2 hgFixed.kimWormLifeMedianRatio \
    hgFixed.kimWormLifeMedianExps kimExpDistance

# CREATE TABLE TO MAP BETWEEN SANGER GENES AND REFSEQ GENES
# sangerToRefSeq (DONE, 2004-05-24, hartera)
hgMapToGene ce2 refGene sangerGene sangerToRefSeq

# CREATE TABLE TO MAP BETWEEN SANGER GENES AND PFAM DOMAINS
# sangerToPfam (DONE, 2004-05-24, hartera)
# Drop table knownToPfam - wrong table name.
# Reload data into sangerToPfam (DONE, 2004-05-28, hartera)
hgMapViaSwissProt ce2 sangerGene name proteinID Pfam sangerToPfam

# CREATE MAPPING OF SANGER GENES TO KIM LAB EXPRESSION DATA GENES
# sangerToKim (DONE, 2004-05-27, hartera)
# This is actually a mapping of the sangerGene table to itself -
# this is how this table was created for ce1.
# This means that many gene names (4614/19134) in kimWormLifeAllRatio
# do not appear in sangerGene; random checks of some of these found that
# they are not in WS120 WormBase and probably represent alternate names,
# names that no longer exist, or withdrawn sequences.
hgMapToGene ce2 sangerGene sangerGene sangerToKim

# SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
# set hgNearOk to 1 on hgcentraltest to make the ce2 Gene Sorter viewable
hgsql -h hgwbeta -e 'update dbDb set hgNearOk = 1 where name = "Ce2";' \
    hgcentraltest

# MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
ssh hgwdev
mkdir /cluster/data/ce2/bed/mitopred
cd /cluster/data/ce2/bed/mitopred
wget http://mitopred.sdsc.edu/data/cel_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' cel_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name varchar(10) not null,        # SwissProt ID
    confidence varchar(8) not null,   # Confidence level
    #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
hgsql ce2 < mitopred.sql
hgsql ce2 -e 'load data local infile "mitopred.tab" into table mitopred'

# MAKE Human Proteins track
ssh eieio
mkdir -p /cluster/data/ce2/blastDb
cd /cluster/data/ce2/blastDb
for i in ../sangerFa/*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastDb
cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/ce2/bed/tblastn.hg16KG
cd /cluster/data/ce2/bed/tblastn.hg16KG
ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
exit    # back to eieio
cd /cluster/data/ce2/bed/tblastn.hg16KG
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc worm.lst | awk "{print \\\$1}"`\)
# 41117/(150000/578) = 158.437507
split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
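# (Not part of the original log.)  Quick sanity check on the split: with
# ~41117 alignments split 158 per chunk, roughly 260 fasta chunks are
# expected in kgfa:
ls kgfa/*.fa | wc -l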
ls -1S kgfa/*.fa > kg.lst
mkdir blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 worm.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/ce2/bed/tblastn.hg16KG
para create blastSpec
para shove
# jobs will need to be restarted, but they should all finish
# Completed: 150858 of 150858 jobs
# CPU time in finished jobs:  10404860s  173414.33m  2890.24h  120.43d  0.330 y
# IO & Wait Time:              2468643s   41144.06m   685.73h   28.57d  0.078 y
# Average job time:                 85s       1.42m     0.02h    0.00d
# Longest job:                     411s       6.85m     0.11h    0.00d
# Submission to last job:        26364s     439.40m     7.32h    0.31d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/ce2/bed/tblastn.hg16KG
para create chainSpec
para push
# Completed: 261 of 261 jobs
# CPU time in finished jobs:       828s      13.81m     0.23h    0.01d  0.000 y
# IO & Wait Time:                19521s     325.34m     5.42h    0.23d  0.001 y
# Average job time:                 78s       1.30m     0.02h    0.00d
# Longest job:                     270s       4.50m     0.07h    0.00d
# Submission to last job:         3878s      64.63m     1.08h    0.04d
exit    # back to eieio
cd /cluster/data/ce2/bed/tblastn.hg16KG/blastOut
for i in kg??
do
    awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/ce2/jkStuff/liftAll.lft warn stdin | \
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastHg16KG.psl
cd ..
blatDir=/cluster/data/hg16/bed/blat.hg16KG
protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl
ssh hgwdev
cd /cluster/data/ce2/bed/tblastn.hg16KG
hgLoadPsl ce2 blastHg16KG.psl
exit    # back to eieio
rm -rf blastOut
# End tblastn Human

# MAKE Drosophila Proteins track
ssh eieio
mkdir -p /cluster/data/ce2/blastDb
cd /cluster/data/ce2/blastDb
for i in ../sangerFa/*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastDb
cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/ce2/bed/tblastn.dm1FB
cd /cluster/data/ce2/bed/tblastn.dm1FB
ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
exit    # back to eieio
cd /cluster/data/ce2/bed/tblastn.dm1FB
mkdir fbfa
split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 1 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
    then
        mv $f.8 $f.1
        break;
    fi
done
if test -f $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp
exit 1
'_EOF_'
chmod +x blastSome
gensub2 worm.lst fb.lst blastGsub blastSpec
exit
ssh kk
cd /cluster/data/ce2/bed/tblastn.dm1FB
para create blastSpec
para push
# Completed: 3283 of 3283 jobs
# CPU time in finished jobs:   3063561s   51059.35m   850.99h   35.46d  0.097 y
# IO & Wait Time:                25333s     422.22m     7.04h    0.29d  0.001 y
# Average job time:                941s      15.68m     0.26h    0.01d
# Longest job:                    3435s      57.25m     0.95h    0.04d
# Submission to last job:         9387s     156.45m     2.61h    0.11d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
# << for emacs
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/ce2/bed/tblastn.dm1FB
para create chainSpec
para push
# Completed: 469 of 469 jobs
# CPU time in finished jobs:       721s      12.02m     0.20h    0.01d  0.000 y
# IO & Wait Time:                 1562s      26.03m     0.43h    0.02d  0.000 y
# Average job time:                  5s       0.08m     0.00h    0.00d
# Longest job:                      42s       0.70m     0.01h    0.00d
# Submission to last job:         5546s      92.43m     1.54h    0.06d
exit    # back to eieio
cd /cluster/data/ce2/bed/tblastn.dm1FB/blastOut
for i in fb??
do
    awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
    sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
    awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
    echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastDm1FB.psl
cd ..
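# (Not part of the original log.)  Optionally validate the combined psl
# before loading, assuming the kent tree's pslCheck utility is on the PATH:
pslCheck blastDm1FB.psl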
ssh hgwdev
cd /cluster/data/ce2/bed/tblastn.dm1FB
hgLoadPsl ce2 blastDm1FB.psl
exit    # back to eieio
rm -rf blastOut
# End tblastn

# MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh eieio
mkdir /cluster/data/ce2/bed/pHMM/coverage
cd /cluster/data/ce2/bed/pHMM/maf.new.axtNet
cat *.maf | nice mafRanges -notAllOGap stdin ce2 \
    /cluster/data/ce2/bed/pHMM/coverage/ce2.mafRanges.bed
ssh hgwdev
cd /cluster/data/ce2/bed/pHMM/coverage
# To make subsequent intersections a bit quicker, output a bed with
# duplicate/overlapping ranges collapsed:
nice featureBits ce2 ce2.mafRanges.bed \
    -bed=ce2.mafRangesCollapsed.bed
#43967098 bases of 100291761 (43.839%) in intersection
# mafCoverage barfs currently, so pass on this for now:
#cat ../maf.new.axtNet/*.maf \
#| nice mafCoverage -count=2 ce2 stdin > ce2.mafCoverage
# Intersect maf target coverage with gene regions --
# use Adam's files which are a union of sangerGene and refGene:
nice featureBits ce2 -enrichment \
    /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormCds.bed \
    ce2.mafRangesCollapsed.bed \
    -bed=ce2.mafCds.bed
#wormCds.bed 25.701%, ce2.mafRangesCollapsed.bed 43.839%, both 20.701%, cover 80.55%, enrich 1.84x
nice featureBits ce2 -enrichment \
    /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr3.bed \
    ce2.mafRangesCollapsed.bed \
    -bed=ce2.mafUtr3.bed
#wormUtr3.bed 0.323%, ce2.mafRangesCollapsed.bed 43.839%, both 0.216%, cover 66.86%, enrich 1.53x
nice featureBits ce2 -enrichment \
    /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr5.bed \
    ce2.mafRangesCollapsed.bed \
    -bed=ce2.mafUtr5.bed
#wormUtr5.bed 0.057%, ce2.mafRangesCollapsed.bed 43.839%, both 0.041%, cover 72.37%, enrich 1.65x

# syntenic net with cb1 (acs, 11/18/04)
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
netFilter -syn cb1.net > cb1Syn.net
netFilter -minGap=10 cb1Syn.net | hgLoadNet ce2 syntenyNetCb1 stdin
# (add trackDb.ra entry)
# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 netCb1
# 88355987 bases of 100291761 (88.099%) in intersection
# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 syntenyNetCb1
# 73427463 bases of 100291761 (73.214%) in intersection
# Hmmm -- looking in browser, seems a bit stringent for worm...
# CLEANUP CB1 BLASTZ (DONE, 2004-02-24, hartera)
ssh eieio
cd /cluster/data/ce2/bed/blastz.cb1
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/cb1Filt.net axtChain/*.net &
nice rm -fr axtChain/chain axtChain/preNet &
nice rm -rf raw &
nice rm -rf lav &

# CLEANUP SELF BLASTZ (DONE, 2004-02-24, hartera)
ssh eieio
cd /cluster/data/ce2/bed/blastzSelf
# remove old chains
nice rm -fr axtChain &
nice rm axtChain.2004-04-30/run1/chain/* &
nice gzip axtChrom/* pslChrom/* axtChain.2004-04-30/all.chain axtChain.2004-04-30/chainFilt/* axtChain.2004-04-30/chain/* &
nice rm -rf raw &
nice rm -rf lav &

###########################################################################
# LOAD GENEID PREDICTIONS (DONE 3/15/05)
ssh eieio
mkdir /cluster/data/ce2/bed/geneid
cd /cluster/data/ce2/bed/geneid
foreach chr (chrI chrII chrIII chrIV chrM chrV chrX)
    wget \
        http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.gtf
    wget \
        http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
perl -wpe 's/^(>\S+)/$1.1/' *.prot > geneid.fa
ssh hgwdev
cd /cluster/data/ce2/bed/geneid
ldHgGene -gtf -genePredExt ce2 geneid *.gtf
hgPepPred ce2 generic geneidPep geneid.fa

###########################################################################
# DONE - 2006-01-06 - Hiram
# Obtained a photograph from Professor Mark Blaxter
# mark.blaxter@ed.ac.uk - http://nema.cap.ed.ac.uk/index.html
# Saved image from email to:
# /cluster/data/ce2/html/C.elegans.jpg
# -rw-rw-r--  1 227317 Jan  6 09:40 C.elegans.jpg
# Transform image to a size appropriate for our WEB pages:
ssh hgwdev
cd /cluster/data/ce2/html
convert -normalize -sharpen 0 -geometry 300x200 \
    C.elegans.jpg Caenorhabditis_elegans.jpg
cp -p Caenorhabditis_elegans.jpg /usr/local/apache/htdocs/images
# Add IMG tag pointer to this image in the description.html page
# and for ce1 and ce3

########################################################################
### microRNA targets tracks (DONE - 2006-03-29 - 2006-04-26 - Hiram)
### from: http://pictar.bio.nyu.edu/  Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
ssh hgwdev
mkdir /cluster/data/ce2/bed/picTar
cd /cluster/data/ce2/bed/picTar
wget --timestamping \
    'http://pictar.bio.nyu.edu/ucsc/new_single_elegans_bed' \
    -O newSingleElegans.bed
grep -v "^track" newSingleElegans.bed \
    | hgLoadBed -strict ce2 picTar stdin
# Loaded 11987 elements of size 9
nice -n +19 featureBits ce2 picTar
# 39233 bases of 100291761 (0.039%) in intersection

########################################################################
# CLEANUP OF CB1 BLASTZ (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain
rm -r cb1NetSplit cb1NetSplit1 tmp1
cd ..
rm -r axtChain.2004-04-29test
# Looks like new chains were made on 2004-04-29
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain.2004-04-29
# remove the split net and chain directories as these can easily be
# created from the all.chain and cb1.net files
rm -r chain cb1NetSplit run1/chain
# gzip files
gzip all.chain *.net
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/
rm -r axtNet1
cd axtNet.2004-04-29
# remove *.axt files as these are also in the sortedaxtNet directory
rm *.axt sortedaxtNet/axtNetCb1.tab
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChrom/maf
gzip sortedaxtNet/*.axt

#######################################################################
## LIFTOVER To Ce4 (DONE - 2007-07-16 Hiram)
ssh kkr1u00
cd /cluster/data/ce4
mkdir nib
cd nib
for C in I II III IV V X M
do
    faToNib -softMask ../goldenPath/chromosomes/chr${C}.fa.gz chr${C}.nib
done
# Writing 15072419 bases in 7536218 bytes
# Writing 15279316 bases in 7639666 bytes
# Writing 13783681 bases in 6891849 bytes
# Writing 17493784 bases in 8746900 bytes
# Writing 20919398 bases in 10459707 bytes
# Writing 17718852 bases in 8859434 bytes
# Writing 13794 bases in 6905 bytes
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
    ce4 /cluster/data/ce4/nib
# as it says, DO THIS NEXT:
ssh kk
# if bin/scripts is not in your PATH, add it for this command:
PATH=$PATH:/cluster/bin/scripts \
    $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
    ce2 /cluster/data/ce2/nib ce4 /iscratch/i/ce4/split10k \
    /cluster/data/ce4/jkStuff/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/run
para try, check, push, check, ...
# Completed: 49 of 49 jobs
# CPU time in finished jobs:     14998s     249.97m     4.17h    0.17d  0.000 y
# IO & Wait Time:                  405s       6.74m     0.11h    0.00d  0.000 y
# Average job time:                314s       5.24m     0.09h    0.00d
# Longest finished job:            706s      11.77m     0.20h    0.01d
# Submission to last job:          728s      12.13m     0.20h    0.01d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
# kkr1u00 is down at this time, fix up this script to work on kkr3u00
ssh kkr3u00
cd /cluster/data/ce2/bed
ln -s blat.ce4.2007-07-16 ce4
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh ce2 ce4
# real    1m1.780s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
    ce2 /cluster/data/ce2/nib ce4 /cluster/data/ce4/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/chainRun
para try, check, push, check, ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs:        19s       0.31m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   17s       0.29m     0.00h    0.00d  0.000 y
# Average job time:                  5s       0.09m     0.00h    0.00d
# Longest running job:               0s       0.00m     0.00h    0.00d
# Longest finished job:              7s       0.12m     0.00h    0.00d
# Submission to last job:            7s       0.12m     0.00h    0.00d
ssh kkstore02
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh ce2 ce4
# Created /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh ce2 ce4
# It says this:
#   Now, add link for
#   /usr/local/apache/htdocs/goldenPath/ce2/liftOver/ce2ToCe4.over.chain
#   to hgLiftOver
# But I believe that link was already done:
cd /gbdb/ce2/liftOver
ls -og ce2ToCe4*
# lrwxrwxrwx  1 53 Jun  5 16:35 ce2ToCe4.over.chain.gz ->
#     /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf:
ce2.upstreamGeneTbl = refGene
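# (Not part of the original log; a sketch only.)  The setting can be
# confirmed after editing genbank.conf; the path below assumes the usual
# genbank build tree layout under /cluster/data/genbank:
grep '^ce2\.' /cluster/data/genbank/etc/genbank.conf
# expect to see:  ce2.upstreamGeneTbl = refGene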