# for emacs: -*- mode: sh; -*- # This file describes how to make the browser database for the # worm C. elegans # Currently 2003-05-28 this file is in a state of flux as it is being # worked out. DOWNLOAD SEQUENCE (DONE 2003-05-12 - Hiram) # next machine ssh eieio mkdir -p /cluster/store5/worm/ce1/sanger mkdir -p /cluster/store5/worm/ce1/tmp cd /cluster/store5/worm/ce1/sanger wget -o ce1.fetch.log -r -l1 --no-directories \ ftp://ftp.sanger.ac.uk/pub/wormbase/current_release/CHROMOSOMES/ # Takes about five minutes # This current_release is updated every two weeks. Although the # sequence is quite stable at this time and changes very little. # Check that you have some valid files. Should be chroms I II III IV V X # in dna, gff and agp formats ls -ogrt # get out of this download data directory, and create your home symlink # shortcut cd .. ln -s /cluster/store5/worm/ce1 ~/ce1 cd ~/ce1 # There is a Mitochondria sequence. It can be found at NCBI # This search sequence at NCBI in 'nucleotide' selects one # entry: X54252: # Caenorhabditis Elegans[Organism] AND mitochondria AND MTCE # I'd like to figure out a wget command to fetch this, but doing # that manually via the WEB browser is not such a big deal. And # besides, once it is fetched, it is stable (haven't seen it # change in years) so you can # merely copy from a previous Ce build. A copy has been manually # placed in ~/ce1/ncbi/x54252.fa # # translate to unzipped .fa, all upper case, and # rename the agp files so hgGoldGap can find them mkdir sangerFa sed -e "s/^>.*/>chrM/" ./ncbi/x54252.fa > sangerFa/chrM.fa foreach f (I II III IV V X) echo -n "${f} " zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \ sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa mkdir sangerFa/${f} ln -s ~/ce1/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp end # hgGoldGap does not handle dir names over 2 characters: mv sangerFa/III sangerFa/3 # CREATE AN .agp FILE FOR ChrM (DONE, 2004-04-19, hartera) ssh hgwdev cd /cluster/data/ce1/sanger cat << '_EOF_' > CHROMOSOME_M.agp M 1 13794 1 F X54252.1 1 13794 + '_EOF_' mkdir -p /cluster/data/ce1/sangerFa/M ln -s /cluster/data/ce1/sanger/CHROMOSOME_M.agp \ /cluster/data/ce1/sangerFa/M/chrM.agp # CREATING DATABASE (DONE 2003-05-12 - Hiram) # Create the database. # next machine ssh hgwdev echo 'create database ce1' | hgsql '' # if you need to delete that database: !!! WILL DELETE EVERYTHING !!! echo 'drop database ce1' | hgsql ce1 # Use df to ake sure there is at least 5 gig free on df -h /var/lib/mysql # Before loading data: # Filesystem Size Used Avail Use% Mounted on # /dev/sda1 472G 389G 58G 87% /var/lib/mysql # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-05-12 - Hiram) # next machine ssh hgwdev # the following command copies all the data from the table # grp in the database rn1 to our new database ce1 echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \ | hgsql ce1 # if you need to delete that table: !!! WILL DELETE ALL grp data !!! echo 'drop table grp;' | hgsql ce1 # PREPARE Split contigs into 100,000 bp chunks for cluster runs # next machine ssh eieio cd ~/ce1 rm -fr ./split mkdir split foreach f (I II III IV V X M) mkdir split/$f faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft end # 151 pieces of 151 written # 153 pieces of 153 written # 138 pieces of 138 written # 175 pieces of 175 written # 210 pieces of 210 written # 178 pieces of 178 written # 1 pieces of 1 written cat split/*.lft > liftAll.lft # copy them to /iscratch/i for cluster rsync # next machine ssh kkr1u00 cd ~/ce1/split foreach c (I II III IV V X M) echo -n "${c} " mkdir -p /iscratch/i/worms/Celegans/unmaskedSplit/${c} cp -p ${c}/c*.fa /iscratch/i/worms/Celegans/unmaskedSplit/${c} end ~kent/bin/iSync # Run RepeatMasker on the chromosomes (DONE 2003-06-04 - Hiram) # next machine ssh kk cd ~/ce1 mkdir RMRun cd RMRun # create the script to use for the RepeatMasker run cat << '_EOF_' > RMWorm #!/bin/csh -fe # # This is a slight rearrangement of the # /cluster/bin/scripts/RMWorm used in makeHg15.doc # The results here need to go to a different location # $1 == chrom name: I II III IV V X M # $2 == directory where split contig .fa is found # $3 == name of contig .fa file cd $1 pushd . cd $2 /bin/mkdir -p /tmp/$3/$1 /bin/cp $3 /tmp/$3/$1 cd /tmp/$3/$1 /scratch/hg/RepeatMasker/RepeatMasker -s -el -ali $3 popd /bin/cp /tmp/$3/$1/$3.out ./ if( -e /tmp/$3/$1/$3.align ) /bin/cp /tmp/$3/$1/$3.align ./ # /bin/cp /tmp/$2*.masked ../masked/ /bin/rm -r /tmp/$3/$1 /bin/rmdir --ignore-fail-on-non-empty /tmp/$3 '_EOF_' chmod +x RMWorm # create job list rm -f RMJobs foreach c (I II III IV V X M) mkdir ~/ce1/RMRun/${c} foreach t ( /iscratch/i/worms/Celegans/unmaskedSplit/$c/c*.fa ) set d = $t:h set f = $t:t echo ./RMWorm ${c} ${d} ${f} \ '{'check out line+ ~/ce1/RMRun/$c/${f}.out'}' \ >> RMJobs end end para create RMJobs para try para check # when they are finished, liftUp and load the .out files into the database: # next machine ssh eieio cd ~/ce1/RMRun foreach c (I II III IV V X M) liftUp chr${c}.fa.out ../split/chr${c}.lft warn ${c}/*.fa.out end # next machine ssh hgwdev cd ~/ce1/RMRun hgLoadOut ce1 chr*.fa.out # Noticed one error in this load: # Processing chrI.fa.out # Strange perc. field -0.6 line 791 of chrI.fa.out # SIMPLE REPEAT [TRF] TRACK (DONE 2003-06-05 - Hiram) # ensure chr*.fa files exist on /iscratch/i # next machine ssh kkr1u00 cp -p /cluster/store5/worm/ce1/sangerFa/*.fa \ /iscratch/i/worms/Celegans/unmaskedFa ~kent/bin/iSync # Create cluster parasol job: # next machine ssh kk mkdir -p ~/ce1/bed/simpleRepeat cd ~/ce1/bed/simpleRepeat mkdir trf ls -1S /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > genome.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp #ENDLOOP '_EOF_' echo "" > dummy.lst gensub2 genome.lst dummy.lst gsub spec para create spec para try # there are only 7, so this runs them all para check Average job time: 468s 7.80m 0.13h 0.01d Longest job: 963s 16.05m 0.27h 0.01d # When cluster run is done, combine into one: cat trf/*.bed > simpleRepeat.bed # Load into the database: # next machine ssh hgwdev cd ~/ce1/bed/simpleRepeat /cluster/bin/i386/hgLoadBed ce1 simpleRepeat simpleRepeat.bed \ -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql # Loaded 28590 elements # PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-06-05 - Hiram) # next machine ssh eieio cd ~/ce1/bed/simpleRepeat mkdir -p trfMask foreach f (trf/*.bed) awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t end # Create Soft and Hard masks from RepeatMaster and TRF outputs: # and rebuild the nib files # next machine ssh eieio cd ~/ce1 mkdir softMask cd ~/ce1 foreach c (I II III IV V X M) echo -n "masking chr${c} " /cluster/bin/i386/maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \ softMask/chr${c}.fa -soft /cluster/bin/i386/maskOutFa softMask/chr${c}.fa \ bed/simpleRepeat/trfMask/chr${c}.bed \ softMask/chr${c}.fa -softAdd faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib end # output: # masking chrI Writing 15080473 bases in 7540245 bytes # masking chrII Writing 15279303 bases in 7639660 bytes # masking chrIII Writing 13783268 bases in 6891642 bytes # masking chrIV Writing 17493790 bases in 8746903 bytes # masking chrV Writing 20922238 bases in 10461127 bytes # masking chrX Writing 17705013 bases in 8852515 bytes # masking chrM Writing 13794 bases in 6905 bytes # unknown if hardMasks are actually needed for anything mkdir hardMask foreach c (I II III IV V X M) echo "masking chr${c}" /cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \ hardMask/chr${c}.fa end # With masked nib files ready, prepare cluster for blastz runs # next machine ssh kkr1u00 cd ~/ce1/softMask mkdir -p /iscratch/i/worms/Celegans/bothMasksFa mkdir -p /iscratch/i/worms/Celegans/bothMasksNib cp -p *.fa /iscratch/i/worms/Celegans/bothMasksFa cd ~/ce1/nib cp -p c*.nib /iscratch/i/worms/Celegans/bothMasksNib ~kent/bin/iSync STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 2003-05-12 - Hiram) # Make symbolic links from /gbdb/ce1/nib to the real nibs. # next machine ssh hgwdev mkdir -p /gbdb/ce1/nib foreach f (/cluster/store5/worm/ce1/nib/*.nib) ln -s $f /gbdb/ce1/nib end cd ~/ce1/tmp # Load /gbdb/ce1/nib paths into database and save size info. hgsql ce1 < ~/kent/src/hg/lib/chromInfo.sql # if you need to delete that table: !!! DELETES ALL DATA IN TABLE !!! echo 'drop table chromInfo;' | hgsql ce1 hgNibSeq -preMadeNib ce1 /gbdb/ce1/nib sangerFa/chr*.fa # Typical output: # Processing chrI.fa to /gbdb/ce1/nib/chrI.nib # Processing chrII.fa to /gbdb/ce1/nib/chrII.nib # Processing chrIII.fa to /gbdb/ce1/nib/chrIII.nib # Processing chrIV.fa to /gbdb/ce1/nib/chrIV.nib # Processing chrM.fa to /gbdb/ce1/nib/chrM.nib # Processing chrV.fa to /gbdb/ce1/nib/chrV.nib # Processing chrX.fa to /gbdb/ce1/nib/chrX.nib # 100277879 total bases # Verify the hgNibSeq load functioned OK: echo "select chrom,size from chromInfo" | hgsql -N ce1 > chrom.sizes cat chrom.sizes # Typical contents of chrom.sizes: chrI 15080473 chrII 15279303 chrIII 13783268 chrIV 17493790 chrM 13794 chrV 20922238 chrX 17705013 # Set up relational mrna tables. hgLoadRna new ce1 # that created a bunch of tables. If you need to delete them: hgLoadRna drop ce1 MAKE GCPERCENT AND GAP tracks (DONE 2003-05-12 - Hiram) # next machine ssh hgwdev mkdir -p /cluster/store5/worm/ce1/bed/gcPercent cd /cluster/store5/worm/ce1/bed/gcPercent hgsql ce1 < ~/kent/src/hg/lib/gcPercent.sql # If you need to delete that table created echo 'drop table gcPercent;' | hgsql ce1; hgGcPercent ce1 ../../nib # hgGoldGap does not handle dir names over 2 characters # directory III has been moved to 3 when all this was created above hgGoldGapGl -noGl ce1 /cluster/store5/worm/ce1 sangerFa # there are no gaps in these chromosomes. They are all making # empty tables except for chrM. We tried running the browser # with these empty tables removed, but then it complained when # we tried to display the assembly track. The browser needs to # be fixed. In the meantime, for completeness, and to allow # featureBits to function, create an empty chrM_gap: cat << '_EOF_' | hgsql ce1 CREATE TABLE chrM_gap ( bin smallint not null, chrom varchar(255) not null, chromStart int unsigned not null, chromEnd int unsigned not null, ix int not null, n char(1) not null, size int unsigned not null, type varchar(255) not null, bridge varchar(255) not null, UNIQUE(chromStart), UNIQUE(chromEnd) ); '_EOF_' # hgGoldGapGl needs to be fixed so it can create a gold track # and not need to create a gap track. And the browser needs to # be fixed so it can display the assembly track without the need # for the gap tables to exist. # REMAKE GAP TRACKS - ADD POSITIONS OF Ns TO ChrN_gap TABLES # (DONE, 2004-04-30, hartera) # Add Ns to gap tables as for Ce2, this is described on Gap details page # next machine ssh hgwdev mkdir -p /cluster/data/ce1/bed/gap cd /cluster/data/ce1/bed/gap # finds motifs and finds location of gaps as part of output foreach c (I II III IV V X M) findMotif -chr=chr${c} -verbose=4 -motif=gcatg \ /gbdb/ce1/nib >& chr${c}Bed.stderr end grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed # All the gap tables are empty so # Load in gap.bed # Need to add extra fields to gap.bed file cat << '_EOF_' > /cluster/data/ce1/scripts/createGapFile.pl #!/usr/bin/perl -w use strict; my $oldchr = ""; while () { my @fields = split(/\t/); my $chr = $fields[0]; if ($chr ne $oldchr) { open(OUT, ">$chr"."_gap.bed"); $oldchr = $chr; } print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n"; } '_EOF_' perl /cluster/data/ce1/scripts/createGapFile.pl < gap.bed # load into relevant tables foreach c (I II III IV V X M) hgLoadBed -tab -oldTable ce1 chr${c}_gap chr${c}_gap.bed end # MAKE chrM_gold TABLE FROM NEW chrM.agp FILE CREATED ABOVE # (DONE, 2004-04-19, hartera) # next machine ssh hgwdev cd /cluster/data/ce1/ hgGoldGapGl -noGl -chrom=chrM ce1 /cluster/data/ce1 sangerFa MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE1 (DONE 2002-05-12 - Hiram) # next machine ssh hgwdev echo 'insert into defaultDb values("C. elegans", "ce1");' \ | hgsql -h genome-testdb hgcentraltest # If you need to delete that entry: echo 'delete from defaultDb where name="ce1";' \ | hgsql -h genome-testdb hgcentraltest # Note: for next assembly, set scientificName column to # "Caenorhabditis elegans" echo 'insert into dbDb values("ce1", "May 2003", \ "/gbdb/ce1/nib", "Worm", "chrII:14642289-14671631", 1, 10, \ "C. elegans");' \ | hgsql -h genome-testdb hgcentraltest # If you need to delete that entry: echo 'delete from dbDb where name="ce1";' \ | hgsql -h genome-testdb hgcentraltest # Make trackDb table so browser knows what tracks to expect: cd ~/kent/src/hg/makeDb/trackDb cvs up -d -P # Edit that makefile to add ce1 in all the right places and do make update make alpha cvs commit makefile MAKE SANGER GENE TRACK (DONE - 2003-05-19 - Hiram) # next machine ssh eieio mkdir -p /cluster/store5/worm/ce1/bed/sangerGene cd /cluster/store5/worm/ce1/bed/sangerGene # the perl trims these files down to a reasonable size. As they are # they cause ldHgGene to hangup due to memory size. foreach f (I II III IV V X) echo -n "${f} " zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \ sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \ -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \ -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \ perl -ne '@a = split; \ print if($a[1] =~ /curated|DNA|RNA/i && \ $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff end echo # check file sizes, should be reasonable ls -ogrt # [hiram@eieio sangerGene]$ ls -ogrt # total 22656 # -rw-r--r-- 1 hiram 3495873 May 19 11:52 chrI.gff # -rw-r--r-- 1 hiram 3670984 May 19 11:54 chrII.gff # -rw-r--r-- 1 hiram 3264917 May 19 11:56 chrIII.gff # -rw-r--r-- 1 hiram 3642256 May 19 11:59 chrIV.gff # -rw-r--r-- 1 hiram 4704564 May 19 12:02 chrV.gff # -rw-rw-r-- 1 hiram 3787721 May 19 12:03 chrX.gff # now load database with those transformed gff files # next machine ssh hgwdev cd /cluster/store5/worm/ce1/bed/sangerGene ldHgGene ce1 sangerGene *.gff # Read 37330 transcripts in 411148 lines in 6 files # 37330 groups 6 seqs 11 sources 6 feature types # 22128 gene predictions # if you need to delete that table: echo 'drop table sangerGene' | hgsql ce1 # MAKING AND STORING mRNA AND EST ALIGNMENTS (WORKING 2003-05-28 - Hiram) # next machine ssh kkr1u00 cd /iscratch/i/worms/Celegans mkdir mRNA # there are 1644 sequences in mrna.fa, the 1000000 makes 1644 files faSplit sequence /iscratch/i/mrna.134/Caenorhabditis_elegans/mrna.fa \ 1000000 mRNA/m ~kent/bin/iSync # next machine ssh kk cd ~/ce1/bed mkdir mrna est cd mrna mkdir psl ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst ls -1S /iscratch/i/worms/Celegans/mRNA/m*.fa > mrna.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst mrna.lst gsub spec para create spec para try para check para push ... etc ... # These jobs are a bit on the short side. I wonder if the mrna # fa's could be put together in larger chunks ? # Average job time: 8s 0.13m 0.00h 0.00d # Longest job: 63s 1.05m 0.02h 0.00d # cluster run done pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \ /dev/null pslSortAcc nohead chrom /tmp all_mrna.psl # load mrna tables # next machine ssh hgwdev cd ~/ce1/bed/mrna/chrom # names must be chr*_mrna.psl foreach d (I II III IV V X M) rm -f chr${d}_mrna.psl mv chr${d}.psl chr${d}_mrna.psl end # this load command rewrites these table entirely if they exist hgLoadPsl ce1 *.psl mkdir /gbdb/ce1/mrna.134 ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa \ /gbdb/ce1/mrna.134 # ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files cd ~/ce1/bed/mrna hgLoadRna add -type=mRNA ce1 /gbdb/ce1/mrna.134/mrna.fa \ /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.ra hgLoadPsl ce1 all_mrna.psl # setup /iscratch/i with the est.fa file for blat run # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Celegans/EST cd /iscratch/i/worms/Celegans/EST # create about a 200 files to provide about 1400 jobs to the cluster # (these jobs are very small despite this small number of est files) faSplit sequence \ /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa 200 e ~kent/bin/iSync # next machine ssh kk cd ~/ce1/bed/est mkdir psl ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst ls -1S /iscratch/i/worms/Celegans/EST/e*.fa > est.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst est.lst gsub spec para create spec para try para check para push # These 1358 jobs are pretty short: # Average job time: 43s 0.71m 0.01h 0.00d # Longest job: 77s 1.28m 0.02h 0.00d # cluster run done # next machine ssh hgwdev cd ~/ce1/bed/est pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \ /dev/null pslSortAcc nohead chrom /tmp all_est.psl cd chrom # names must be chr*_est.psl foreach d (I II III IV V X M) rm -f chr${d}_est.psl mv chr${d}.psl chr${d}_est.psl end # this load command rewrites these table entirely if they exist hgLoadPsl ce1 *.psl ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa \ /gbdb/ce1/mrna.134 # ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files cd ~/ce1/bed/est hgLoadRna add -mrnaType=EST -type=EST ce1 /gbdb/ce1/mrna.134/est.fa \ /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.ra hgLoadPsl ce1 all_est.psl -nobin MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE1 (DONE - 2003-05-29 - Hiram) # next machine ssh hgwdev # DNA port is "0", trans prot port is "1" echo 'insert into blatServers values("ce1", "blat10", "17784", "0"); \ insert into blatServers values("ce1", "blat10", "17785", "1");' \ | hgsql -h genome-testdb hgcentraltest # if you need to delete those entries echo 'delete from blatServers where db="ce1";' \ | hgsql -h genome-testdb hgcentraltest # to check the entries: echo 'select * from blatServers where db="ce1";' \ | hgsql -h genome-testdb hgcentraltest PRODUCING FUGU FISH ALIGNMENTS (DONE 2003-06-05 - Hiram) # Assumes masked NIBs have been prepared as above # and Fugu pieces are already on kluster /iscratch/i. # next machine ssh kk mkdir -p ~/ce1/bed/blatFugu cd ~/ce1/bed/blatFugu mkdir psl ls -1S /iscratch/i/fugu/*.fa > fugu.lst ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst fugu.lst gsub spec para create spec para try para check para push para check # Average job time: 237s 3.95m 0.07h 0.00d # Longest job: 1326s 22.10m 0.37h 0.02d # want names chrI_blatFugu # When cluster run is done, sort alignments # next machine ssh eieio cd ~/ce1/bed/blatFugu foreach d (I II III IV V X M) echo -n "${d} " pslCat psl/chr${d}_*.psl | \ pslSortAcc nohead chrom temp stdin rm -f chrom/chr${d}_blatFugu.psl mv chrom/chr${d}.psl chrom/chr${d}_blatFugu.psl end # next machine ssh hgwdev cd ~/ce1/bed/blatFugu/chrom hgLoadPsl ce1 chr*_blatFugu.psl # Make fugu /gbdb/ symlink and load Fugu sequence data. mkdir /gbdb/ce1/fuguSeq ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/ce1/fuguSeq # ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files cd ~/ce1/bed/blatFugu hgLoadRna addSeq ce1 /gbdb/ce1/fuguSeq/fugu_v3_mask.fasta # RUN Waba alignment with briggsae (DONE - 2003-06-02 - Hiram) # prepare contigs from C. briggsae # Assumes C. briggsae data has been downloaded according to # makeCb1.doc # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Cbriggsae/waba_contigs cd /iscratch/i/worms/Cbriggsae/waba_contigs faSplit sequence ../contigs.fa 2000 c ~kent/bin/iSync # next machine ssh kk mkdir ~/ce1/bed/waba/out cd ~/ce1/bed/waba ls /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst ls /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > elegans.lst echo "" > dummy.lst # create scripts to be used here cat << '_EOF_' > wabaRun #!/bin/csh -fe # # $1 - full pathname to a briggsae contig # $2 - file path to an elegans chrom.fa # $3 - result file full pathname # set f = $1:t set chr = $2:t set d = $chr:r mkdir -p /tmp/$d/$f cp $1 /tmp/$d/$f pushd . cd /tmp/$d/$f set t = $f:r /cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1 /cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2 /cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab cp $t.wab $3 popd rm -f /tmp/$d/$f/$t.* rmdir --ignore-fail-on-non-empty /tmp/$d/$f rmdir --ignore-fail-on-non-empty /tmp/$d '_EOF_' chmod +x wabaRun echo cat << '_EOF_' > gsub #LOOP /cluster/store5/worm/ce1/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce1/bed/waba/out/$(root2)/$(root1).wab} #ENDLOOP '_EOF_' gensub2 briggsae.lst elegans.lst gsub spec para create spec para try para check para push ... etc ... # one of the jobs takes quite a while. Most of the others are OK: # Average job time: 2308s 38.46m 0.64h 0.03d # Longest job: 47442s 790.70m 13.18h 0.55d # next machine ssh hgwdev # you may need to build hgWaba cd ~/kent/src/hg/makeDb/hgWaba make # hgWaba.c has been adjusted to avoid the squeezed assert by # ignoring them. That may want to be looked at some day. # (The waba cluster run has a number of errors too. waba is not # perfect, but it is as good as the Intronerator was...) cd ~/ce1/bed/waba # one of the outputs produces a result that won't process properly # with hgWaba. Move it out of the way mv out/chrII/c0911.wab out/chrII_c0911.wab.broken mkdir Load cd Load # The cat through the pipe to hgWaba will avoid making large files # that are not needed. cat << '_EOF_' > loadEm.sh #!/bin/sh # for c in I II III IV V X M do echo -n "${c} " cat /cluster/store5/worm/ce1/bed/waba/out/chr${c}/c*.wab | ~/bin/i386/hgWaba ce1 Cbr chr${c} 0 stdin > proc${c}.out 2>&1 done exit 0 '_EOF_' # run it to load the waba track: ./loadEm.sh # remove garbage temp file: rm full_waba.tab rm chrom_waba.tab # worm/ce1/trackDb.ra entry: # track cbrWaba # shortLabel Briggsae Waba # longLabel C. briggsae Waba Alignments # group genes # priority 50 # visibility dense # color 140,0,200 # altColor 210,140,250 # BEGINNING OF Cb1 BLASTZ # next machine ssh kkr1u00 # These nibs were created in the combined RepeatMasker and TRF above mkdir -p /iscratch/i/worms/Celegans/rmsk cd /iscratch/i/worms/Celegans/rmsk cp -p /cluster/store5/worm/ce1/RMRun/chr*.fa.out . ~kent/bin/iSync # These blastz sections usually start by saying: # FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes # This Notes file is the actual blastz run. # But I found those notes a bit obscure, for clarity I have # reproduced the exact steps here. # The initial DEF file has been created for this. # It is in ~angie/hummus/DEF.ce1-cb1.2003-05-29 # which will blastz soft masked Cbriggsae ultra contigs # against the soft masked Celegans chroms # 1. created the DEF file for this particular blastz run # The TARGET files are the masked NIBS created above, in: # SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib # and the repeatmasker *.out files in: # SEQ1_RMSK=/iscratch/i/worms/Celegans/rmsk # There is no lineage specific repeats for the worms as far as # I can tell, thus: # SEQ1_SMSK=/dev/null # This requires: # BLASTZ_ABRIDGE_REPEATS=0 # The QUERY sequence are the repeat and trf soft masked chrUn.fa # nib file from the briggsae processing: # SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib # And the repeat masker run from the briggsae processing: # SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk # No lineage specific repeats: # SEQ2_SMSK=/dev/null # # Our base of operations: # BASE=/cluster/store5/worm/ce1/bed/blastzCb1 # # Reproduced here are the instructions as specified in the # hummus/Notes file. # Evidently historical archives of the DEF files exist in # in ~angie/hummus/DEF* and they are copied out of there to # to the file: $BASE/DEF for the actual run. Don't worry, none of # this seems to make any sense. # next machine ssh kk cd ~/ce1/bed/blastzCb1 # source the DEF file which was carefully written as above. . ./DEF mkdir -p $BASE/run ~angie/hummus/make-joblist $DEF > $BASE/run/j # that created xdir.sh and joblist run/j sh $BASE/xdir.sh # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size cd $BASE/run # now edit j because it is not correct: sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2 wc -l j* head j2 # *** make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j para try para check para push ... etc ... Average job time: 1222s 20.36m 0.34h 0.01d Longest job: 3772s 62.87m 1.05h 0.04d # When that cluster run is done, results are in $BASE/raw/chr*/* # continuing with ~angie/hummus/Notes: # --- normalize and single_cov cd ~/ce1/bed/blastzCb1 # source the DEF file again in case you are coming back to this . ./DEF # a new run directory mkdir -p $BASE/run.1 # another obscure script creates a new job list: ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j cd $BASE/run.1 # the job list is once again incorrect, edit it: sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2 wc -l j* head j2 # make sure the edited j2 is OK, then use it: mv j2 j para create j para try para push;para check; ... etc ... Average job time: 65s 1.08m 0.02h 0.00d Longest job: 136s 2.27m 0.04h 0.00d # Translate the .lav files created by the end of ~angie/hummus Notes # into axt files ssh eieio set base="/cluster/store5/worm/ce1/bed/blastzCb1" set seq1_dir="/cluster/store5/worm/ce1/nib" set seq2_dir="/cluster/store5/worm/cb1/nib" set tbl="blastzCb1" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \ | /cluster/bin/i386/axtSort stdin $out popd end # Translate the sorted axt files into psl: cd $base mkdir -p pslChrom foreach f (axtChrom/chr*.axt) set c=$f:t:r /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # load these blastz results # next machine ssh hgwdev cd ~/ce1/bed/blastzCb1/pslChrom /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl # trackDb/worm/ce1/trackDb.ra entry: # track blastzCb1 # shortLabel briggsae Blastz # longLabel Blastz C briggsae # group compGeno # priority 159 # visibility dense # color 0,0,0 # altColor 50,128,50 # spectrum on # type psl xeno cb1 # CHAINING Briggsae blastz # next machine, small cluster is good for these tiny jobs ssh kkr1u00 cd /cluster/store5/worm/ce1/bed/blastzCb1 mkdir axtChain cd axtChain mkdir run1 cd run1 ls -1S ../../axtChrom/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /cluster/store5/worm/ce1/nib /cluster/store5/worm/cb1/nib $2 > $3 '_EOF_' chmod a+x doChain mkdir out chain gensub2 input.lst single gsub spec para create spec para try Average job time: 303s 5.04m 0.08h 0.00d Longest job: 1006s 16.77m 0.28h 0.01d # now on the cluster server, sort chains # next machine ssh eieio cd ~/ce1/bed/blastzCb1/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd ~/ce1/bed/blastzCb1/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain ce1 ${c}_cb1Chain $i echo done $c end # Create the nets XXXX STOPPED HERE 2003-06-11 # next machine ssh eieio cd ~/ce1/bed/blastzCb1/axtChain mkdir preNet cd chain # MAKING AND STORING refSeq ALIGNMENTS (WORKING 2003-06-02 - Hiram) # MAKING AND STORING refSeq ALIGNMENTS - hgRefSeqMrna load REDONE (2004-05-18, hartera) # hgRefSeqStatus - refSeqStatus table rebuilt with updated loc2ref # (2004-06-16, hartera) # next machine ssh kkr1u00 mkdir -p /iscratch/i/worms/Celegans/refSeq cd /iscratch/i/worms/Celegans # there are about 20000 sequences in the refSeq.fa file, # split them into about 300 files faSplit sequence \ /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \ 300 refSeq/r ~kent/bin/iSync # next machine ssh kk cd ~/ce1/bed mkdir -p refSeq/psl cd refSeq # for reference: ln -s \ /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.* . ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/refSeq/r*.fa > refSeq.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl} #ENDLOOP '_EOF_' gensub2 worm.lst refSeq.lst gsub spec para create spec para try para check para push ... etc ... # Average job time: 55s 0.92m 0.02h 0.00d # Longest job: 626s 10.43m 0.17h 0.01d # when cluster run done # next machine ssh eieio cd ~/ce1/bed/refSeq pslSort dirs raw.psl /tmp psl pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_refSeq.psl \ /dev/null pslSortAcc nohead chrom /tmp all_refSeq.psl pslCat -dir chrom > refSeqAli.psl # load refSeq tables # next machine ssh hgwdev cd ~/ce1/bed/refSeq hgLoadPsl ce1 -tNameIx refSeqAli.psl ln -s \ /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \ /gbdb/ce1/mrna.134 # ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files cd ~/ce1/bed/refSeq hgLoadRna add -type=refSeq ce1 /gbdb/ce1/mrna.134/refSeq.fa \ /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra # Reload refLink table as LocusLink ID column is all 0 and protein # accessions missing. (DONE, 2004-05-12, hartera) # RefSeq accessions for ce1 appear not to be in # /cluster/store5/mrna.134/refSeq/041003/loc2ref so could not also # map the LocusLink ID to OMIM ID through mim2loc # Dowload the latest loc2ref and mim2loc from NCBI and put in # /cluster/store5/mrna.134/refSeq/051204/ # Previous error in loading refPep was because there were duplicate # accessions in hs.faa once the suffix of .1 and .2 were removed # A search in GenBank showed that NP_002900.1 has been replaced # by NP_002900.2 so removed the former from hs.faa # (2004-05-18, hartera) Deleted contents of refPep table since # hs.faa contains only human sequences. No C. elegans peptide sequences # available from this NCBI Build 134 so added protein translation of genomic DNA # on the RefSeq genes details pages. hgRefSeqMrna ce1 /gbdb/ce1/mrna.134/refSeq.fa \ /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra \ all_refSeq.psl \ /cluster/store5/mrna.134/refSeq/051204/loc2ref \ /cluster/store5/mrna.134/refSeq/041003/hs.faa \ /cluster/store5/mrna.134/refSeq/051204/mim2loc # Missing locusLinkIds for 184 of 20661 # Missing protein accessions for 184 of 20661 # Add RefSeq status info (updated 2004-06-16 as mrnaAcc values not found # in refLink.mrnaAcc column). Only load in those for C. elegans (Taxonomy # ID is 6239) cd /cluster/data/ce1/bed/refSeq hgRefSeqStatus -taxId=6239 ce1 \ /cluster/store5/mrna.134/refSeq/051204/loc2ref # Create precomputed join of refFlat and refGene: echo 'CREATE TABLE refFlat \ (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \ SELECT refLink.name as geneName, refGene.* \ FROM refLink,refGene \ WHERE refLink.mrnaAcc = refGene.name' \ | hgsql ce1 # END - MAKING AND STORING refSeq ALIGNMENTS # BEGINNING OF Self BLASTZ # DEF file here is the same as the blastzCb1/DEF file, with # the following changes: # SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib # SEQ2_DIR=/iscratch/i/worms/Celegans/bothMasksNib # SEQ2_RMSK=/iscratch/i/worms/Celegans/rmsk # SEQ2_CHUNK=10000000 # SEQ2_LAP=10000 # BASE=/cluster/store5/worm/ce1/bed/blastzSelf # Reproduced here are the instructions as specified in the # hummus/Notes file. # Evidently historical archives of the DEF files exist in # in ~angie/hummus/DEF* and they are copied out of there to # to the file: $BASE/DEF for the actual run. Don't worry, none of # this seems to make any sense. # next machine ssh kk mkdir -p ~/ce1/bed/blastzSelf cd ~/ce1/bed/blastzSelf . ./DEF mkdir -p $BASE/run ~angie/hummus/make-joblist $DEF > $BASE/run/j # that created xdir.sh and joblist run/j sh $BASE/xdir.sh # xdir.sh makes a bunch of result directories in $BASE/raw/ # based on chrom name and CHUNK size cd $BASE/run # now edit j because it is not correct: sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2 wc -l j* head j2 # *** make sure the j2 edits are OK, then use it: mv j2 j # para create will create the file: 'batch' for the cluster run para create j para try para check para push ... etc ... Average job time: 2228s 37.14m 0.62h 0.03d Longest job: 15281s 254.68m 4.24h 0.18d # That longest job time is not correct. I was having trouble with # the cluster during that run. # When that cluster run is done, results are in $BASE/raw/chr*/* # continuing with ~angie/hummus/Notes: # --- normalize and single_cov cd ~/ce1/bed/blastzSelf # source the DEF file again in case you are coming back to this . ./DEF # a new run directory mkdir -p $BASE/run.1 # another obscure script creates a new job list: ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j cd $BASE/run.1 # the job list is once again incorrect, edit it: sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2 wc -l j* head j2 # make sure the edited j2 is OK, then use it: mv j2 j para create j para try para push;para check; ... etc ... Average job time: 241s 4.01m 0.07h 0.00d Longest job: 459s 7.65m 0.13h 0.01d # Translate the .lav files created by the end of ~angie/hummus Notes # into axt files ssh eieio set base="/cluster/store5/worm/ce1/bed/blastzSelf" set seq1_dir="/cluster/store5/worm/ce1/nib" set seq2_dir="/cluster/store5/worm/ce1/nib" set tbl="blastzSelf" cd $base mkdir -p axtChrom foreach c (lav/*) pushd $c set chr=$c:t set out=$base/axtChrom/$chr.axt echo "Translating $chr lav to $out" cat `ls -1 *.lav | sort -g` \ | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \ | /cluster/bin/i386/axtSort stdin $out popd end # DropOverlap cleans out a bit more than DropSelf: cd $base mkdir DropOverlap foreach c (M I II III IV V X) echo "dropping chr${c}" /cluster/bin/i386/axtDropOverlap axtChrom/chr${c}.axt \ DropOverlap/chr${c}.axt end # Translate the sorted axt files into psl: mkdir -p pslChrom foreach f (DropOverlap/chr*.axt) set c=$f:t:r /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl end # load these blastz results (they already have the correct names # chr*_blastzSelf.psl from axtToPsl above) # next machine ssh hgwdev cd ~/ce1/bed/blastzSelf/pslChrom /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl # trackDb/worm/ce1/trackDb.ra entry: # track blastzSelf # shortLabel Blastz Self # longLabel Blastz C elegans (self) # group compGeno # priority 159 # visibility dense # color 0,0,0 # altColor 50,128,50 # spectrum on # type psl xeno ce1 # CHAINING Self blastz # next machine, small cluster is good for these tiny jobs ssh kkr1u00 cd /cluster/store5/worm/ce1/bed/blastzSelf mkdir -p axtChain/run1 cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1 ls -1S /cluster/store5/worm/ce1/bed/blastzSelf/DropOverlap/*.axt > input.lst cat << '_EOF_' > gsub #LOOP doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out} #ENDLOOP '_EOF_' # Nibs are on /cluster/bluearc now cat << '_EOF_' > doChain #!/bin/csh axtChain $1 /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib /iscratch/i/worms/Celegans/bothMasksNib $2 > $3 '_EOF_' chmod a+x doChain mkdir out chain gensub2 input.lst single gsub spec para create spec para try para push ... etc ... Average job time: 502s 8.37m 0.14h 0.01d Longest job: 874s 14.57m 0.24h 0.01d # now on the cluster server, sort chains # next machine ssh eieio cd ~/ce1/bed/blastzSelf/axtChain chainMergeSort run1/chain/*.chain > all.chain chainSplit chain all.chain # optionally: rm run1/chain/*.chain # Load chains into database # next machine ssh hgwdev cd ~/ce1/bed/blastzSelf/axtChain/chain foreach i (*.chain) set c = $i:r hgLoadChain ce1 ${c}_chainSelf $i echo done $c end # Loading 845504 chains into ce1.chrI_chainSelf # Loading 845307 chains into ce1.chrII_chainSelf # Loading 1152790 chains into ce1.chrIII_chainSelf # Loading 859566 chains into ce1.chrIV_chainSelf # Loading 2 chains into ce1.chrM_chainSelf # Loading 1114939 chains into ce1.chrV_chainSelf # Loading 604293 chains into ce1.chrX_chainSelf # END of Blastz Self LOAD SOFTBERRY GENES (DONE - 2003-08-14 - Hiram) mkdir -p /cluster/data/ce1/bed/softberry cd /cluster/data/ce1/bed/softberry wget \ ftp://www.softberry.com/pub/SC_CEL_MAY03/Soft_fgenesh_Ce03.tar.gz tar xvzf Soft_fgenesh_Ce03.tar.gz ldHgGene ce1 softberryGene chr*.gff # Read 19112 transcripts in 276202 lines in 6 files # 19112 groups 6 seqs 1 sources 4 feature types # 19112 gene predictions hgPepPred ce1 softberry *.protein hgSoftberryHom ce1 *.protein # Add entry to worm/ce1/trackDb.ra and add specific # softberryGene.html file CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) mkdir /cluster/data/ce1/bed/orfToGene cd !$ wget ftp://ftp.wormbase.org/pub/wormbase/GENE_DUMPS/gene_names.txt awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt hgCeOrfToGene ce1 gene_names.txt sangerGene orfToGene CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear) mkdir /cluster/data/ce1/bed/steinHelp # Mail Lincoln Stein and ask for mappings from wormPep to # SwissProt and from ORF to wormPep to description. Place # these files in the steinHelp directory: # seqPepDesc.ace - contains ORF/WormPep/Description # spWp.ace - contains WormPep/SwissProt # Then go to the AQL search page at # http://www.wormbase.org/db/searches/aql_query. # and create the following files in that directory: # wbGeneClass.txt - associates 3 letter first part of worm gene # name with a brief description. Produced with query: # select l,l->Description from l in class Gene_Class # wbConcise.txt - associates worm gene name with a several # sentence description. Produces with query: # select l,g from l in class Locus, g in l->Gene_information[Provisional_description] hgWormLinks spWp.ace seqPepDesc.ace wbConcise.txt wbGeneClass.txt ../orfToGene/orfToGene sangerLinks.txt hgsql ce1 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql hgsql -e \ 'load data local infile "sangerLinks.txt" into table sangerLinks.sql' \ ce1 TWINSCAN GENE PREDICTIONS (DONE - 2004-01-05 - Hiram) mkdir /cluster/data/ce1/bed/twinscan cd /cluster/data/ce1/bed/twinscan wget --timestamping \ http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS100/12-15-2003/Celegans_WS100_12-15-03.tgz tar xvzf Celegans_WS100_12-15-03.tgz rm -r chr_tx # clean up chrom field of GTF files foreach c (I II III IV V X) set f = chr_gtf/CHROMOSOME_${c}.pred_gtf echo ${f} sed -e "s/CHROMOSOME_/chr/g" ${f} | \ sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \ chr_gtf/chr${c}-fixed.gtf end # pare down protein FASTA header to id and add missing .1: foreach c (I II III IV V X) set f = chr_ptx/CHROMOSOME_${c}.ptx echo ${f} perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \ ${f} > chr_ptx/chr${c}-fixed.fa end ldHgGene ce1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS # Read 20889 transcripts in 146518 lines in 6 files # 20889 groups 206 seqs 1 sources 3 feature types # 20889 gene predictions hgPepPred ce1 generic twinscanPep chr_ptx/chr*-fixed.fa # MYTOUCH FIX - Jen - 2006-01-25 sudo mytouch ce1 twinscanPep 0404031400.00 # MRNAORIENT, creating tables for clusterRna operation # (working 2004-02-10 - Hiram) mkdir -p /cluster/bluearc/ce1/bed/est mkdir -p /cluster/bluearc/ce1/bed/mrna cd /cluster/bluearc/ce1/bed/est cp -p /cluster/data/ce1/bed/est/chrom/*.psl . pslSortAcc nohead contig /tmp/psl *.psl cd /cluster/bluearc/ce1/bed/mrna cp -p /cluster/data/ce1/bed/mrna/chrom/*.psl . pslSortAcc nohead contig /tmp/psl *.psl mkdir mRNA faSplit sequence /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa 1000000 mRNA/m mkdir -p /cluster/data/ce1/bed/mrnaOrientInfo/oi cd /cluster/data/ce1/bed/mrnaOrientInfo ls -1S /cluster/bluearc/ce1/bed/mrna/contig/*.psl > psl.lst ls -1S /iscratch/i/worms/Celegans/mrna.134/mrna.fa > mrna.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/mrna.fa {check out line+ oi/$(root1).tab} #ENDLOOP '_EOF_' ssh kk cd /cluster/data/ce1/bed/mrnaOrientInfo gensub2 psl.lst mrna.lst gsub jobList para create jobList para try # there are only six jobs, this runs them cat oi/*.tab > mrnaOrientInfo.bed hgLoadBed ce1 mrnaOrientInfo mrnaOrientInfo.bed \ -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql mkdir -p /cluster/data/ce1/bed/estOrientInfo/oi cd /cluster/data/ce1/bed/estOrientInfo ls -1S /cluster/bluearc/ce1/bed/est/contig/*.psl > psl.lst ls -1S /iscratch/i/worms/Celegans/mrna.134/est.fa > est.lst cat << '_EOF_' > gsub #LOOP /cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/est.fa {check out line+ oi/$(root1).tab} #ENDLOOP '_EOF_' ssh kkr1u00 cd /cluster/data/ce1/bed/estaOrientInfo gensub2 psl.lst est.lst gsub jobList para create jobList para try # there are only six jobs, this runs them cat oi/*.tab > estOrientInfo.bed bedSort estOrientInfo.bed estOrientInfo.bed hgLoadBed ce1 estOrientInfo estOrientInfo.bed \ -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql mkdir /cluster/data/ce1/bed/rnaCluster cd /cluster/data/ce1/bed/rnaCluster foreach f (I II III IV V X) echo "clusterRna ce1 /dev/null chr${f}.bed -chrom=${f}" clusterRna ce1 /dev/null chr${f}.bed -chrom=chr${f} end hgLoadBed ce1 rnaCluster *.bed hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce1 | egrep -v chromStart | cut -f2\-13 > $_.ce1.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh mkdir -p /cluster/data/ce1/bed/altSplice/agx cd /cluster/data/ce1/bed/altSplice for c in I II III IV V X M do hgsql -e "create table chr${c}_intronEst as select chr${c}_est.* from chr${c}_est,estOrientInfo where (chr${c}_est.qName = estOrientInfo.name) and (estOrientInfo.intronOrientation != 0)" ce1 done for c in I II III IV V X M do hgsql -e "alter table chr${c}_intronEst add index bin (bin);" ce1 hgsql -e "alter table chr${c}_intronEst add index tStart (tStart);" ce1 hgsql -e "alter table chr${c}_intronEst add index qName (qName(12));" ce1 hgsql -e "alter table chr${c}_intronEst add index tEnd (tEnd);" ce1 echo "done chr${c}" done hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce1 -beds=../rnaCluster/$_.ce1.rnaCluster.bed -agxOut=agx/$_.ce1.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec # these jobs are no big deal on the worm, just run this file: chmod +x altSplice.para.spec mkdir agx ./altSplice.para.spec cat agx/*.agx > altSplice.agx agxToBed altSplice.agx altSplice.bed cat << '_EOF_' >> altGraphX.sql CREATE TABLE altGraphX ( bin smallint unsigned not null, tName varchar(255) NOT NULL default '', tStart int(11) NOT NULL default '0', tEnd int(11) NOT NULL default '0', name varchar(255) NOT NULL default '', id int(10) unsigned NOT NULL auto_increment, strand char(2) NOT NULL default '', vertexCount int(10) unsigned NOT NULL default '0', vTypes longblob NOT NULL, vPositions longblob NOT NULL, edgeCount int(10) unsigned NOT NULL default '0', edgeStarts longblob NOT NULL, edgeEnds longblob NOT NULL, evidence longblob NOT NULL, edgeTypes longblob NOT NULL, mrnaRefCount int(11) NOT NULL default '0', mrnaRefs longblob NOT NULL, mrnaTissues longblob NOT NULL, mrnaLibs longblob NOT NULL, PRIMARY KEY (id), KEY tName (tName(16),tStart,tEnd) ); '_EOF_' # << emacs hgLoadBed -sqlTable=altGraphX.sql ce1 altGraphX altSplice.agx # trackDb entry: track altGraphX shortLabel Alt-Splice longLabel Alternative Splicing group rna priority 74.1 visibility pack type bed 3 canPack off # copy frame based view to goldenPath 2006-01-12 - Hiram cd /usr/local/apache/htdocs/goldenPath ln -s ceMay2003 ce1 cd /cluster/data/ce1/bed/altSites mkdir /usr/local/apache/htdocs/goldenPath/ce1/altGraphX sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \ > /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/links.html sed -e \ "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \ -e "s/NM_005487/chrII:14689493-14690232/" \ -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \ frame.html \ > /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/frame.html # GC 5 BASE WIGGLE TRACK (DONE - 2004-02-13 - Hiram) # working on eieio, the fileserver for this data. # was named gc20KBase to start with, became a 5 base dataset later # the generic trackDb html file is named gc20KBase.html ssh eieio mkdir /cluster/data/ce1/bed/gc20KBase cd /cluster/data/ce1/bed/gc20KBase mkdir wigData5 dataLimits5 for n in ../../nib/*.nib do c=`basename ${n} | sed -e "s/.nib//"` C=`echo $c | sed -e "s/chr//"` echo -n "working on ${c} - ${C} ... " hgGcPercent -chr=${c} -doGaps \ -file=stdout -win=5 ce1 ../../nib | grep -w GC | awk ' { bases = $3 - $2 perCent = $5/10.0 printf "%d\t%.1f\n", $2+1, perCent }' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \ -wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c} echo "done ${c}" done # data is compete, load track on hgwdev ssh hgwdev cd /cluster/data/ce1/bed/gc20KBase hgLoadWiggle ce1 gc20KBase wigData5/*.wig # create symlinks for .wib files mkdir /gbdb/ce1/wib ln -s `pwd`/wigData5/*.wib /gbdb/ce1/wib # the trackDb entry track gc20KBase shortLabel GC Percent longLabel GC Percent in 5 base windows group map priority 23.5 visibility hide autoScale Off maxHeightPixels 128:36:16 graphTypeDefault Bar gridDefault OFF windowingFunction Mean color 0,128,255 altColor 255,128,0 viewLimits 30:70 type wig 0 100 # miRNA track (DONE - 2004-03-02 - Hiram) # data from: Sam Griffiths-Jones # and Michel.Weber@ibcg.biotoul.fr # notify them if this assembly updates to renew this track mkdir /cluster/data/ce1/bed/miRNA cd /cluster/data/ce1/bed/miRNA wget --timestamping \ "http://www.sanger.ac.uk/Software/Rfam/mirna/data/ucsc_cel_mir_track.txt" grep "^chr" ucsc_cel_mir_track.txt | sed -e "s/ /\t/g" > worm.mir.bed hgLoadBed ce1 miRNA worm.mir.bed # entry in trackDb/trackDb.ra already there for same track in Hg16 # see makeHg16.doc for more comments on this ##################################################################### # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie) # File emailed from Xinwei She mkdir /cluster/data/ce1/bed/genomicSuperDups cd /cluster/data/ce1/bed/genomicSuperDups sed -e 's/CHROMOSOME_/chr/g; s/\t_\t/\t-\t/;' \ ce1_genomicSuperDup.tab \ | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \ | hgLoadBed ce1 genomicSuperDups stdin \ -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql