# for emacs: -*- mode: sh; -*-

# Caenorhabditis elegans
# Washington University School of Medicine GSC and Sanger Institute WS190
# $Id: ce6.txt,v 1.19 2010/04/15 18:03:15 chinhli Exp $

#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-05-30 - Hiram)
    ssh kkstore06
    mkdir /cluster/store3/worm/ce6
    ln -s /cluster/store3/worm/ce6 /cluster/data/ce6
    mkdir /cluster/data/ce6/downloads
    cd /cluster/data/ce6/downloads
    for C in I II III IV V X MtDNA
    do
        F=WS190/CHROMOSOMES/CHROMOSOME_${C}
        wget --timestamping \
            "ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.agp"
        wget --timestamping \
            "ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.dna.gz"
        wget --timestamping \
            "ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.gff.gz"
    done

#########################################################################
# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2008-05-30 - Hiram)
    ssh kkstore06
    cd /cluster/data/ce6/downloads
    # Fix fasta names:
    zcat CHR*.dna.gz \
        | sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \
        | gzip -c > UCSC.fa.gz
    faSize -detailed UCSC.fa.gz
    # chrI    15072421
    # chrII   15279323
    # chrIII  13783681
    # chrIV   17493785
    # chrM       13794
    # chrV    20919568
    # chrX    17718854
    # Make sure we get the same sizes from this command:
    zcat CHR*.dna.gz | sed -e '/^$/ d;' | faSize -detailed stdin
    # Fix AGP names:
    sed -e 's/^/chr/' CHR*.agp > UCSC.agp
    # And add a fake mitochondrial AGP entry for the sake of downstream
    # tools (make sure the GenBank sequence is identical to given):
    echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp

#########################################################################
# run the makeGenomeDb procedure to create the db and unmasked sequence
#	(DONE - 2008-05-30 - Hiram)
    ssh kkstore06
    cd /cluster/data/ce6
    cat << '_EOF_' > ce6.config.ra
# Config parameters for makeGenomeDb.pl:
db ce6
clade worm
genomeCladePriority 10
scientificName Caenorhabditis elegans
commonName C. elegans
assemblyDate May 2008
assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS190
orderKey 826
mitoAcc none
fastaFiles /cluster/data/ce6/downloads/UCSC.fa.gz
agpFiles /cluster/data/ce6/downloads/UCSC.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
'_EOF_'
    # << emacs

    mkdir jkStuff
    # run just to AGP to make sure things are sane first
    nice makeGenomeDb.pl ce6.config.ra -stop agp \
        > jkStuff/makeGenomeDb.agp 2>&1
    # now, continuing to make the Db and all
    time nice -n +19 makeGenomeDb.pl ce6.config.ra -continue db \
        > jkStuff/makeGenomeDb.log 2>&1
    # that takes 2m30s to run to completion
    # take the trackDb business there and check it into the source tree
    # fixup the description, gap and gold html page descriptions

#########################################################################
# REPEATMASKER (DONE - 2008-05-30 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/repeatMasker
    cd /cluster/data/ce6/bed/repeatMasker
    time nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \
        -buildDir=/cluster/data/ce6/bed/repeatMasker ce6 > do.log 2>&1 &
    # real 133m17.088s
    # from the do.log:
    # RepeatMasker version development-$Id: RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
    # CC RELEASE 20071204; *

#########################################################################
# SIMPLE REPEATS (DONE - 2008-05-30 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/simpleRepeat
    cd /cluster/data/ce6/bed/simpleRepeat
    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kki \
        -buildDir=/cluster/data/ce6/bed/simpleRepeat ce6 > do.log 2>&1 &
    # about 30 minutes

#########################################################################
# MASK SEQUENCE WITH RM+TRF (DONE - 2008-05-30 - Hiram)
    # Since both doRepeatMasker.pl and doSimpleRepeat.pl have completed,
    # now it's time to combine the masking into the final ce6.2bit,
    # following the instructions at the end of doSimpleRepeat's output.
    ssh kolossus
    cd /cluster/data/ce6
    twoBitMask ce6.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce6.2bit
    # You can safely ignore the warning about extra BED columns
    twoBitToFa ce6.2bit stdout | faSize stdin
    # 100281426 bases (0 N's 100281426 real 86981803 upper 13299623 lower)
    #	in 7 sequences in 1 files
    # Total size: mean 14325918.0 sd 6728308.1 min 13794 (chrM)
    #	max 20919568 (chrV) median 15279323
    # %13.26 masked total, %13.26 masked real

    # set the symlink on hgwdev to /gbdb/ce6
    cd /gbdb/ce6
    ln -s /cluster/data/ce6/ce6.2bit .

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-05-30 - Hiram)
    # Use -repMatch=100 (based on size -- for human we use 1024, and
    # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
    # size from featureBits.  So we would use 34, but that yields a very
    # high number of tiles to ignore, especially for a small, more compact
    # genome.  Bump that up a bit to be more conservative.
    ssh kolossus
    cd /cluster/data/ce6
    blat ce6.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=jkStuff/11.ooc -repMatch=100
    # Wrote 8523 overused 11-mers to jkStuff/11.ooc

    # copy all of this stuff to the Iservers
    ssh kkr1u00
    mkdir /iscratch/i/worms/ce6
    cd /iscratch/i/worms/ce6
    cp -p /cluster/data/ce6/ce6.2bit .
    cp -p /cluster/data/ce6/jkStuff/11.ooc .
    cp -p /cluster/data/ce6/chrom.sizes .
    cp -p /cluster/data/ce6/jkStuff/11.ooc /san/sanvol1/scratch/worms/ce6
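    # For reference, the scaling arithmetic behind the -repMatch comment
    # above (a sketch, not part of the original build): 1024 is the human
    # repMatch and ~3.4% the worm/human gapless size ratio quoted there.
    echo "1024 * 0.034" | bc -l
    # 34.816  (rounded to 34 in the note above, then bumped to 100)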
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-05-30 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add ce6 just before ce4

# ce6 (C. elegans)
ce6.serverGenome = /cluster/data/ce6/ce6.2bit
ce6.clusterGenome = /iscratch/i/worms/ce6/ce6.2bit
ce6.ooc = /iscratch/i/worms/ce6/11.ooc
ce6.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
ce6.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
ce6.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
ce6.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
ce6.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
ce6.maxIntron = 100000
ce6.lift = no
ce6.genbank.mrna.xeno.load = yes
ce6.refseq.mrna.xeno.load = yes
ce6.downloadDir = ce6

    cvs ci -m "Added ce6." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    ssh genbank
    screen	# use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial ce6 &
    # logFile: var/build/logs/2008.05.30-15:04:49.ce6.initalign.log
    # real 137m58.675s
    # real 61m33.930s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce6
    # logFile: var/dbload/hgwdev/logs/2008.05.30-20:29:01.dbload.log
    # real 23m8.651s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add ce6 to: etc/align.dbs etc/hgwdev.dbs
    cvs ci -m "Added ce6 - C. elegans WS190" \
        etc/align.dbs etc/hgwdev.dbs
    make etc-update
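    # A quick post-load sanity check (a sketch, not from the original log):
    # the standard genbank alignment tables should now exist and have rows:
    hgsql -N -e 'select count(*) from all_mrna;' ce6
    hgsql -N -e 'select count(*) from xenoMrna;' ce6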
###############################################################################
## SANGER GENE TRACK (DONE - 2008-06-03 - Hiram)
    ## There is a tremendous amount of extraneous notation in the gff files.
    ## Filter them down to a manageable set, change the chrom names,
    ## eliminate duplicates, and select only what will work in ldHgGene.
    ssh kkstore06
    mkdir /cluster/data/ce6/bed/sangerGene
    cd /cluster/data/ce6/bed/sangerGene
    for C in I II III IV V X
    do
        echo -n "${C} "
        zcat ../../downloads/CHROMOSOME_${C}.gff.gz | \
            sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
                s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
                s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
                s/CHROMOSOME_M/chrM/g;" \
            -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
            -e 's/CDS "//' -e 's/"//' \
            > chr${C}.gff
    done
    C=M
    echo -n "${C} "
    zcat ../../downloads/CHROMOSOME_MtDNA.gff.gz | \
        sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
            s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
            s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
            s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
        -e 's/CDS "//' -e 's/"//' \
        > chr${C}.gff

    for C in I II III IV V X M
    do
        echo "chr${C}.gff -> filtered.chr${C}.gff"
        grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
    if (match($2,"curated|Coding_transcript")) {
        if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
            gsub("coding_exon","CDS",$3)
            gsub("Transcript ","",$9)
            gsub(" .*","",$9)
            gsub("three_prime_UTR","3utr",$3)
            gsub("five_prime_UTR","5utr",$3)
            for (i = 1; i < 9; ++i) { printf "%s\t", $i }
            printf "%s\n", $9
        }
    }
}
' | sort -u | sort -k4n > filtered.chr${C}.gff
    done

    ssh hgwdev
    cd /cluster/data/ce6/bed/sangerGene
    nice -n +19 ldHgGene ce6 sangerGene filtered.*.gff
    nice -n +19 ldHgGene -out=filteredGenePred.tab ce6 sangerGene filtered.*.gff
    # Read 52512 transcripts in 997038 lines in 7 files
    #   52512 groups 7 seqs 3 sources 5 feature types
    #   30296 gene predictions

    ## this doesn't work with the bin column, so alter that table
    hgsql -e "alter table sangerGene drop column bin;" ce6
    ## (found when trying to click through on sequence links, mRNA or
    ## protein, when on a gene details page)
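    # Sanity check on the loaded table (a sketch, not from the original log):
    # the row count should match the "30296 gene predictions" reported above.
    hgsql -N -e 'select count(*) from sangerGene;' ce6
    # 30296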
    ## Create orfToGene table.  The tar file containing this information
    ## was obtained from Anthony Rogers via email 2008-06-03  ar2@sanger.ac.uk
    ## They had one duplicate line in their file:
    sort -u /cluster/data/ce6/downloads/tarFile/gene_name.txt \
        | sed -e "s/  */\t/" > orfToGene.tab
    ## plus, they didn't include the self-with-self references, which are
    ## needed in the gene sorter so it can find its names.  So, fixing
    ## this table:
    hgsql -N -e "select name from sangerGene;" ce6 | sort > name.sangerGene.txt
    cat << '_EOF_' > fixupOrf.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %sangerGeneName;
my %alias;	# key is alias name, value is real gene name
my %realNames;	# real gene names seen in the alias list
my %twoDot;	# two-dot names that need an entry

open (FH, "<name.sangerGene.txt") or die "can not read name.sangerGene.txt";
while (my $line = <FH>) {
    chomp $line;
    $sangerGeneName{$line} = 1;
}
close (FH);

open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($alias, $real) = split('\s+', $line);
    die "duplicate alias name found: $alias" if (exists($alias{$alias}));
    if (exists($sangerGeneName{$alias})) {
        $alias{$alias} = $real;
        $realNames{$real} = 1;
        my ($a, $b, $c, $d) = split('\.',$alias);
        die "four dot name found $line" if (defined($d));
        my $twoDotName = sprintf "%s.%s", $a, $b;
        if (exists($sangerGeneName{$twoDotName})) {
            $twoDot{$twoDotName} = $real;
        }
    }
}
close (FH);

my $count = 0;
my $noReal = 0;
my %needReal;
foreach my $key (keys %alias) {
    ++$count;
    my $realName = $alias{$key};
    if (!exists($alias{$realName})) {
        if (!exists($needReal{$realName}) && exists($sangerGeneName{$realName})) {
            ++$noReal;
            $needReal{$realName} = $key;
            my ($a, $b, $c, $d) = split('\.',$realName);
            die "four dot name found $realName" if (defined($d));
            my $twoDotName = sprintf "%s.%s", $a, $b;
            if (exists($sangerGeneName{$twoDotName})) {
                $twoDot{$twoDotName} = $key;
            }
        }
    }
}
print STDERR "found $count alias names\n";
print STDERR "can not find $noReal real names in alias list\n";
$count = 0;
foreach my $key (keys %realNames) { ++$count; }
print STDERR "found $count real names\n";
$noReal = 0;
foreach my $key (keys %needReal) {
    ++$noReal;
    print STDERR "$key $needReal{$key}\n" if ($noReal < 10);
}
print STDERR "need to add $noReal real names\n";

open (FH,">fixedOrfToGene.tab") or die "can not write to fixedOrfToGene.tab";
foreach my $key (keys %alias) {
    printf FH "%s\t%s\n", $key, $alias{$key};
}
foreach my $key (keys %needReal) {
    printf FH "%s\t%s\n", $key, $key;
}
foreach my $key (keys %twoDot) {
    if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
        print STDERR "need name for $key $twoDot{$key}\n";
        printf FH "%s\t%s\n", $key, $twoDot{$key};
    }
}
foreach my $key (keys %sangerGeneName) {
    if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
        if ( !exists($twoDot{$key}) ) {
            print STDERR "last chance need name for $key $key\n";
            printf FH "%s\t%s\n", $key, $key;
        }
    }
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x fixupOrf.pl
    ./fixupOrf.pl > fixupOrf.out 2>&1
    hgLoadSqlTab ce6 orfToGene ~/kent/src/hg/lib/orfToGene.sql \
        fixedOrfToGene.tab

    ln -s /cluster/data/ce6/downloads/tarFile/paragraph.txt ./paragraph.tab
    ln -s /cluster/data/ce6/downloads/tarFile/uniprot.txt ./uniprot.tab
    cat << '_EOF_' > mkSangerLinks.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %geneToOrf;	# key is uc(gene name), value is orf
my %orfToGene;	# key is uc(orf), value is gene name
my $orfCount = 0;

open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($orf, $gene) = split('\t',$line);
    die "gene name zero length" if (length($gene) < 1);
    die "orf name zero length" if (length($orf) < 1);
    $geneToOrf{uc($gene)} = $orf;
    $orfToGene{uc($orf)} = $gene;
    ++$orfCount;
}
close (FH);
printf STDERR "read $orfCount orf designations from orfToGene.tab\n";

open (LS,">sangerLinks.tab") or die "can not write to sangerLinks.tab";
my $geneCount = 0;
open (FH, "sort -u paragraph.tab|") or die "can not read paragraph.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($orf, $descr) = split('\t',$line);
    die "gene name zero length" if (length($orf) < 1);
    die "can not find orfToGene $orf" if (!exists($orfToGene{uc($orf)}));
    my $orfGene = $orfToGene{uc($orf)};
    die "orfGene name zero length" if (length($orfGene) < 1);
    printf LS "%s\t%s\t%s\n", $orf,
        $orfGene, $descr;
    ++$geneCount;
}
close (FH);
printf STDERR "read $geneCount gene descriptions from paragraph.tab\n";
close (LS);
'_EOF_'
    # << happy emacs
    chmod +x mkSangerLinks.pl
    ./mkSangerLinks.pl
    # read 33792 orf designations from orfToGene.tab
    # read 33793 gene descriptions from paragraph.tab
    # There is a problem in paragraph.tab, it contains a duplicate entry:
    #   F57C7.4 F57C7.4 contains similarity to Interpro domain IPR008972 ()
    #   F57C7.4 F57C7.4 contains similarity to Plasmodium falciparum Hypothetical protein.\; TR:Q8I5U3
    # manually fix up this one to eliminate the second definition
    hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
        sangerLinks.tab

    # Add proteinID field to sangerGene table, used by Gene Sorter
    hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce6
    # To add index on this column
    hgsql -e 'alter table sangerGene add index(proteinID);' ce6
    # Use SwissProt IDs in sangerLinks table to fill proteinID
    # column in the sangerGene table
    hgsql -e 'update sangerGene as g, sangerLinks as l \
        set g.proteinID = l.protName where g.name = l.orfName;' ce6
    # check there are rows with the protName field filled
    hgsql -N -e "select proteinId from sangerGene" ce6 | sort | uniq -c | wc -l
    # 20127
    hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
        sangerLinks.tab

    wget --timestamping \
        "ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormpep190.tar.gz"
    wget --timestamping \
        "ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormrna190.tar.gz"
    tar xvzf wormrna190.tar.gz
    mv README README.wormrna190
    tar xvzf wormpep190.tar.gz
    # creates files in directory wormpep190:
    # -rw-rw-r-- 1 32316202 Apr  7 06:38 wormpep.dna190
    # -rw-rw-r-- 1   980303 Apr  7 06:41 wormpep.history190
    # -rw-rw-r-- 1     4826 Apr  7 06:41 wormpep.diff190
    # -rw-rw-r-- 1 19139454 Apr  7 06:41 wormpep.fasta190
    # -rw-rw-r-- 1   398315 Apr  7 06:41 wormpep.accession190
    # -rw-rw-r-- 1 13244499 Apr 15 06:23 wormpep190
    # -rw-rw-r-- 1  2608936 Apr 15 06:23 wormpep.table190
    hgPepPred ce6 generic sangerPep wormpep190/wormpep190

###############################################################################
######## MAKING GENE SORTER TABLES ####  (DONE - 2008-06-04 - Hiram)
    # These are instructions for building the Gene Sorter browser.
    # Don't start these until there is a sangerGene table with a proteinID
    # column containing Swiss-Prot protein IDs; Kim lab expression data
    # is also required (in hgFixed).

    # Cluster together various alt-splicing isoforms.
    # Creates the sangerIsoforms and sangerCanonical tables
    ssh hgwdev
    hgClusterGenes ce6 sangerGene sangerIsoforms sangerCanonical
    # Got 20051 clusters, from 30296 genes in 7 chromosomes
    featureBits ce6 sangerCanonical
    # 57412493 bases of 100281426 (57.251%) in intersection
    featureBits ce4 sangerCanonical
    # 57092312 bases of 100281244 (56.932%) in intersection
    featureBits ce2 sangerCanonical
    # 54156601 bases of 100291761 (53.999%) in intersection
    featureBits ce1 sangerCanonical
    # 53654286 bases of 100277784 (53.506%) in intersection
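    # The cluster result can also be cross-checked in SQL (a sketch, not
    # from the original log); counts should match the hgClusterGenes
    # report above:
    hgsql -N -e 'select count(*) from sangerCanonical;' ce6	# 20051
    hgsql -N -e 'select count(*) from sangerIsoforms;' ce6	# 30296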
    # Create Self blastp table - sangerBlastTab
    # Get sangerPep peptide sequences fasta file from the sangerPep dir
    # and create a blast database out of them.
    mkdir -p /cluster/data/ce6/bed/blastp
    cd /cluster/data/ce6/bed/blastp
    cat << '_EOF_' > pepTabToFa.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH,"<../sangerGene/sangerPep.tab")
    or die "can not open ../sangerGene/sangerPep.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($name,$seq) = split('\s+',$line);
    printf ">%s\n%s\n", $name, $seq;
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ./pepTabToFa.pl
    ./pepTabToFa.pl > wormPep190.faa
    /cluster/bluearc/blast229/formatdb -i wormPep190.faa \
        -t wormPep190 -n wormPep190
    # Copy database over to kluster storage
    mkdir -p /san/sanvol1/scratch/worms/ce6/blastp
    rsync -a ./ /san/sanvol1/scratch/worms/ce6/blastp/
    cd /cluster/data/ce6/bed/blastp
    mkdir split
    faSplit sequence wormPep190.faa 1000 split/wp

    # Make parasol run directory
    ssh pk
    mkdir -p /cluster/data/ce6/bed/blastp/self
    cd /cluster/data/ce6/bed/blastp/self
    mkdir run
    cd run
    mkdir out

    # Make blast script
    cat << '_EOF_' > blastSome.csh
#!/bin/csh -ef
setenv BLASTMAT /cluster/bluearc/blast229/data
/cluster/bluearc/blast229/blastall -p blastp \
    -d /san/sanvol1/scratch/worms/ce6/blastp/wormPep190 \
    -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
    # << happy emacs
    chmod a+x blastSome.csh

    # Make gensub2 file
    cat << '_EOF_' > gsub
#LOOP
blastSome.csh {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch
    ls ../../split/*.fa > split.lst
    gensub2 split.lst single gsub jobList
    para create jobList
    para try ... check ... push ... etc ...
    # Completed: 986 of 986 jobs
    # CPU time in finished jobs:  18187s  303.12m  5.05h  0.21d  0.001 y
    # IO & Wait Time:              2713s   45.22m  0.75h  0.03d  0.000 y
    # Average job time:              21s    0.35m  0.01h  0.00d
    # Longest finished job:          73s    1.22m  0.02h  0.00d
    # Submission to last job:       302s    5.03m  0.08h  0.00d

    # Load into database.
    ssh hgwdev
    cd /cluster/data/ce6/bed/blastp/self/run/out
    hgLoadBlastTab ce6 sangerBlastTab *.tab
    # Scanning through 986 files
    # Loading database with 1255312 rows
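    # Spot check (a sketch, not from the original log): every protein should
    # hit itself in the self-blastp table, so this count should be close to
    # the number of wormPep sequences that were aligned:
    hgsql -N -e 'select count(*) from sangerBlastTab where query = target;' ce6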
    # CREATE EXPRESSION DISTANCE TABLES FOR KIM LAB EXPRESSION DATA
    # (this appears to be an expensive command, about 10 to 15 minutes)
    hgExpDistance ce6 hgFixed.kimWormLifeMedianRatio \
        hgFixed.kimWormLifeMedianExps kimExpDistance
    # Have 19134 elements in hgFixed.kimWormLifeMedianRatio
    # Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio

    # CREATE TABLE TO MAP BETWEEN SANGER GENES AND REFSEQ GENES - sangerToRefSeq
    hgMapToGene ce6 refGene sangerGene sangerToRefSeq

    # SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
    # set hgNearOk to 1 on hgcentraltest to make the ce6 Gene Sorter viewable
    hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce6";' \
        hgcentraltest

    # Running joinerCheck to see what is sane:
    cd ~/kent/src/hg/makeDb/schema
    joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
    # ce6.orfToGene.name - hits 30296 of 30296 ok
    # ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
    # ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
    # ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
    # ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
    # ce6.sangerLinks.orfName - hits 30115 of 30115 ok
    # ce6.sangerPep.name - hits 23771 of 23771 ok
    # ce6.sangerToRefSeq.name - hits 29995 of 29995 ok

    ## create a gene name to WBGene ID reference table, to be used to
    ## construct URLs
    mkdir /cluster/data/ce6/bed/WBGeneID
    cd /cluster/data/ce6/bed/WBGeneID
    # There was one duplicate entry in the file, the sort -u eliminates it
    sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \
        ../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab
    # add in the one-dot names for those names that are only two-dot names
    # in this file:
    cat << '_EOF_' > addDotName.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %sangerGeneName;
my %twoDotNames;	# key is gene name from the URL file, value is WBGene ID

open (FH, "sangerGene.name") or die "can not read sangerGene.name";
while (my $line = <FH>) {
    chomp $line;
    $sangerGeneName{$line} = 1;
}
close (FH);

open (FH, "<sangerGeneToWBGeneID.tab")
    or die "can not read sangerGeneToWBGeneID.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($name, $wbId) = split('\s+',$line);
    die "duplicate gene name $name $wbId" if (exists($twoDotNames{$name}));
    if (exists($sangerGeneName{$name})) {
        $twoDotNames{$name} = $wbId;
    }
}
close (FH);

my %neededNames;
foreach my $key (keys %twoDotNames) {
    my ($a, $b, $c, $d) = split('\.',$key);
    my $wbId = $twoDotNames{$key};
    die "three dot name found $key $wbId" if (defined($d));
    if (defined($c)) {
        my $oneDot = sprintf "%s.%s", $a, $b;
        if (!exists($twoDotNames{$oneDot})) {
            if (!exists($neededNames{$oneDot})) {
                $neededNames{$oneDot} = $wbId;
            }
        }
    }
}
foreach my $key (keys %twoDotNames) {
    printf "%s\t%s\n", $key, $twoDotNames{$key};
}
foreach my $key (keys %neededNames) {
    printf "%s\t%s\n", $key, $neededNames{$key};
}
'_EOF_'
    # << happy emacs
    chmod +x addDotName.pl
    hgsql -N -e "select name from sangerGene;" ce6 | sort -u > sangerGene.name
    ./addDotName.pl > fixed.sangerGeneToWBGeneID.tab
    hgLoadSqlTab ce6 sangerGeneToWBGeneID \
        ~/kent/src/hg/lib/sangerGeneToWBGeneID.sql \
        fixed.sangerGeneToWBGeneID.tab

    cd ~/kent/src/hg/makeDb/schema
    joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
    # ce6.orfToGene.name - hits 30296 of 30296 ok
    # ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
    # ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
    # ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
    # ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
    # ce6.sangerLinks.orfName - hits 30115 of 30115 ok
    # ce6.sangerPep.name - hits 23771 of 23771 ok
    # ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
    # ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
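    # Coverage check for the new table (a sketch, not from the original log):
    # list sangerGene names that still lack a WBGene ID mapping.
    hgsql -N -e 'select name from sangerGene;' ce6 | sort -u > /tmp/sg.name
    hgsql -N -e 'select sangerGene from sangerGeneToWBGeneID;' ce6 \
        | sort -u > /tmp/wbMap.name
    comm -23 /tmp/sg.name /tmp/wbMap.name | wc -l	# ideally 0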
############################################################################
## fixup sangerLinks table (DONE - 2008-06-03 - Hiram)
    ## it is missing descriptions for the one-dot names where only two-dot
    ## names existed
    mkdir /cluster/data/ce6/bed/fixupSangerLinks
    cd /cluster/data/ce6/bed/fixupSangerLinks
    ln -s ../sangerGene/sangerLinks.tab .
    cat << '_EOF_' > slFixup.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %existing;	# key is name, value is "orf<tab>description"
my %newNeeded;	# key is one-dot name, value is "orf<tab>description"
my $newCount = 0;

open (FH, "<sangerLinks.tab") or die "can not read sangerLinks.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($name, $orf, $desc) = split('\t',$line);
    my ($a, $b, $c, $d) = split('\.', $name);
    die "ERROR: found three dot name at $line" if (defined($d));
    if (defined($c)) {
        my $oneDotName = sprintf "%s.%s", $a, $b;
        if (exists($newNeeded{$oneDotName})) {
            my $newString = sprintf "%s\t%s", $orf, $desc;
            die "ERROR: new one dot name, different descr $name"
                if ($newNeeded{$oneDotName} ne $newString);
        } else {
            $newNeeded{$oneDotName} = sprintf "%s\t%s", $orf, $desc;
            ++$newCount;
        }
    }
    die "ERROR: duplicate name $name at $line" if (exists($existing{$name}));
    $existing{$name} = sprintf "%s\t%s", $orf, $desc;
}
close (FH);

foreach my $key (keys %newNeeded) {
    die "ERROR: duplicate new name $key for $newNeeded{$key}"
        if (exists($existing{$key}));
    printf "%s\t%s\n", $key, $newNeeded{$key};
}
foreach my $key (keys %existing) {
    die "ERROR: duplicate existing name $key for $existing{$key}"
        if (exists($newNeeded{$key}));
    printf "%s\t%s\n", $key, $existing{$key};
}
'_EOF_'
    # << happy emacs
    chmod +x ./slFixup.pl
    ./slFixup.pl > fixedSangerLinks.tab
    hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
        fixedSangerLinks.tab

    # It still has a difficulty because it contains references to the RNA
    # genes, and joinerCheck complains.  So, after loading sangerRnaGene
    # below, remove the entries in sangerLinks that are in sangerRnaGene.
    # I tried to figure out a delete statement that would do this, but
    # to no avail.  So, brain-dead list management (a single multi-table
    # DELETE would also work; see the sketch after the RNA genes section
    # below):
    hgsql -N -e "select orfName from sangerLinks;" ce6 \
        | sort -u > sangerLinks.orfName
    hgsql -N -e "select name from sangerRnaGene;" ce6 \
        | sort -u > sangerRnaGene.name
    # before delete:
    hgsql -e "select count(*) from sangerLinks;" ce6
    # 36555
    comm -12 sangerLinks.orfName sangerRnaGene.name | while read N
    do
        hgsql -e "delete from sangerLinks where orfName=\"$N\";" ce6
        echo $N
    done
    # after delete:
    hgsql -e "select count(*) from sangerLinks;" ce6
    # 30115
    # 36555 - 30115 = 6440

###############################################################################
## RNA sangerGenes (DONE - 2008-06-03 - Hiram)
    ssh kkstore06
    cd /cluster/data/ce6/bed/sangerGene
    for C in I II III IV V X M
    do
        echo "chr${C}.gff -> rna.chr${C}.gff"
        grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
    if (match($2,"^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) {
        if (match($3,"exon")) {
            gsub("Transcript ","",$9)
            gsub(" .*","",$9)
            for (i = 1; i < 9; ++i) { printf "%s\t", $i }
            printf "%s\n", $9
        }
    }
}
' | sort -u | sort -k4n > rna.chr${C}.gff
    done

    for C in I II III IV V X M
    do
        echo "chr${C}.gff -> pseudoGene.chr${C}.gff"
        grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
    if (match($2,"^Pseudogene")) {
        if (match($3,"Pseudogene")) {
            gsub("Pseudogene ","",$9)
            gsub("Pseudogene","CDS",$3)
            gsub(" .*","",$9)
            gsub("\".*","",$9)
            for (i = 1; i < 9; ++i) { printf "%s\t", $i }
            printf "%s\n", $9
        }
    }
}
' | sort -u | sort -k4n > pseudoGene.chr${C}.gff
    done

    ssh hgwdev
    cd /cluster/data/ce6/bed/sangerGene
    time nice -n +19 ldHgGene ce6 sangerRnaGene rna.*.gff
    # Read 6440 transcripts in 6474 lines in 7 files
    #   6440 groups 7 seqs 9 sources 1 feature types
    #   6440 gene predictions
    time nice -n +19 ldHgGene ce6 sangerPseudoGene pseudoGene.*.gff
    # Read 1462 transcripts in 1462 lines in 7 files
    #   1462 groups 6 seqs 1 sources 1 feature types
    #   1462 gene predictions
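    # As referenced above in the sangerLinks fixup: a single multi-table
    # DELETE (a sketch, not what was actually run) accomplishes the same
    # cleanup in one statement, since MySQL can delete from one table based
    # on a join with another:
    hgsql ce6 -e 'delete sangerLinks from sangerLinks, sangerRnaGene \
        where sangerLinks.orfName = sangerRnaGene.name;'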
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
    # After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("ce6", "blat9", "17788", "1", "0"); \
        INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
        VALUES ("ce6", "blat9", "17789", "0", "1");' \
        hgcentraltest
    # test it with some sequence (see the sketch after the Blastz SELF
    # section below)

############################################################################
# reset default position, same as ce4, on the ZC101 / unc-52 locus
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746" \
        where name="ce6";' hgcentraltest

##########################################################################
## Blastz SELF (DONE - 2008-06-04 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzCe6.2008-06-04
    cd /cluster/data/ce6/bed
    ln -s blastzCe6.2008-06-04 blastzSelf
    cd blastzCe6.2008-06-04
    cat << '_EOF_' > DEF
# ce6 vs ce6 - Self alignment
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: elegans Ce6
SEQ1_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ1_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: elegans Ce6
SEQ2_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ2_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=10000

BASE=/cluster/data/ce6/bed/blastzCe6.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
        -smallClusterHub=kki > do.log 2>&1 &
    # real 79m26.707s
    # XXX - running 2008-06-04 12:20
    # broke down during loadUp because /iscratch/i/worms/ce6/chrom.sizes
    # does not exist on hgwdev.  Finish the load manually by fixing
    # the chrom.sizes reference in loadUp.csh.  Then:
    time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
        -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
        -continue=download -smallClusterHub=kki > download.log 2>&1 &
    # real 37m22.066s
    ssh hgwdev
    cd /cluster/data/ce6/bed/blastz.ce6.2008-06-04
    nice -n +19 featureBits ce6 chainSelfLink > fb.ce6.chainSelfLink.txt 2>&1
    # 31186593 bases of 100281244 (31.099%) in intersection
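    # To "test it with some sequence" as noted in the BLATSERVERS entry
    # above, a gfClient query against the new untranslated server can be
    # used (a sketch; someSeq.fa stands in for any small ce6 DNA fasta):
    gfClient blat9 17789 /gbdb/ce6 someSeq.fa testBlat.psl
    head testBlat.psl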
############################################################################
## BLASTZ caePb2 (DONE - 2008-06-04,09 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
    cd /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
    cat << '_EOF_' > DEF
# ce6 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. brenneri PB2801 caePb2
SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/cluster/data/ce6/bed/blastzCaePb2.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -smallClusterHub=kki > do.log 2>&1 &
    # real 32m29.730s
    # trouble with chaining due to no copies of genomes on /scratch/data/
    # for the Iservers
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
    # real 26m55.624s
    # forgot the -qRepeats=windowmaskerSdust
    rm axtChain/ce6.caePb2.net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk -qRepeats=windowmaskerSdust \
        -continue=load -smallClusterHub=kki > load.log 2>&1 &
    # real 1m18.270s
    cat fb.ce6.chainCaePb2Link.txt
    # 40805712 bases of 100281426 (40.691%) in intersection

    # swap, this is also in caePb2.txt
    mkdir /cluster/data/caePb2/bed/blastz.ce6.swap
    cd /cluster/data/caePb2/bed/blastz.ce6.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        /cluster/data/ce6/bed/blastzCaePb2.2008-06-04/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
    # a couple of minutes
    cat fb.caePb2.chainCe6Link.txt
    # 55092134 bases of 170473138 (32.317%) in intersection
############################################################################
## BLASTZ caeRem3 (DONE - 2008-06-04,09 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
    cd /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
    cat << '_EOF_' > DEF
# ce6 vs caeRem3
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. remanei caeRem3
SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -smallClusterHub=kki > do.log 2>&1 &
    # real 28m40.865s
    # trouble with chaining due to no copies of genomes on /scratch/data/
    # for the Iservers
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
    # real 19m30.231s
    # forgot the -qRepeats=windowmaskerSdust, went back and fixed that
    # manually in loadUp.csh and reran the load of the net track
    cat fb.ce6.chainCaeRem3Link.txt
    # 41853863 bases of 100281426 (41.736%) in intersection

    # swap, this is also in caeRem3.txt
    mkdir /cluster/data/caeRem3/bed/blastz.ce6.swap
    cd /cluster/data/caeRem3/bed/blastz.ce6.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
    cat fb.caeRem3.chainCe6Link.txt
    # 46326709 bases of 138406388 (33.472%) in intersection
    # something unusual happened with ssh during installDownloads.csh,
    # continuing after finishing that manually:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -continue=cleanup -qRepeats=windowmaskerSdust \
        /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > cleanUp.log 2>&1 &
############################################################################
## BLASTZ cb3 (DONE - 2008-06-04,09 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzCb3.2008-06-04
    cd /cluster/data/ce6/bed/blastzCb3.2008-06-04
    cat << '_EOF_' > DEF
# ce6 vs cb3
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. briggsae cb3
SEQ2_DIR=/san/sanvol1/scratch/worms/cb3/cb3.rmskTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/worms/cb3/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/cluster/data/ce6/bed/blastzCb3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -smallClusterHub=kki > do.log 2>&1 &
    # real 33m24.103s
    # 42483532 bases of 100281426 (42.364%) in intersection

    # swap, this is also in cb3.txt
    mkdir /cluster/data/cb3/bed/blastz.ce6.swap
    cd /cluster/data/cb3/bed/blastz.ce6.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /cluster/data/ce6/bed/blastzCb3.2008-06-04/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
    # real 3m20.338s
    # real 22m40.205s
    cat fb.cb3.chainCe6Link.txt
    # 43180969 bases of 108433446 (39.823%) in intersection

############################################################################
## BLASTZ priPac1 (DONE - 2008-06-09 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
    cd /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
    cat << '_EOF_' > DEF
# ce6 vs priPac1
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: Pristionchus pacificus priPac1
SEQ2_DIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.2bit
SEQ2_LEN=/iscratch/i/worms/priPac1/chrom.sizes
SEQ2_CTGDIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.2bit
SEQ2_CTGLEN=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.sizes
SEQ2_LIFT=/iscratch/i/worms/priPac1/priPac1.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/cluster/data/ce6/bed/blastzPriPac1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
    # after some kk parasol experiments, continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -continue=cat `pwd`/DEF -verbose=2 -bigClusterHub=kk \
        -qRepeats=windowmaskerSdust -smallClusterHub=kki > cat.log 2>&1 &
    # real 9m46.300s
    cat fb.ce6.chainPriPac1Link.txt
    # 6256254 bases of 100281426 (6.239%) in intersection

    # swap, this is also in priPac1.txt
    mkdir /cluster/data/priPac1/bed/blastz.ce6.swap
    cd /cluster/data/priPac1/bed/blastz.ce6.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        /cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
    # real 0m33.836s
    # failed during loadUp.csh due to /iscratch/i/ missing from hgwdev;
    # fixed that manually in loadUp.csh and ran it on hgwdev
    cat fb.priPac1.chainCe6Link.txt
    # 6803322 bases of 145948246 (4.661%) in intersection
    # and continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        -continue=download /cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
        -bigClusterHub=kk -smallClusterHub=kki -swap > download.log 2>&1 &
############################################################################
## BLASTZ caeJap1 (DONE - 2008-06-09 - Hiram)
    ssh kkstore06
    screen	# use screen to control the job
    mkdir /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
    cd /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
    cat << '_EOF_' > DEF
# ce6 vs caeJap1
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000

# QUERY: C. japonica caeJap1
SEQ2_DIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/caeJap1/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.2bit
SEQ2_CTGLEN=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.sizes
SEQ2_LIFT=/cluster/bluearc/scratch/data/caeJap1/caeJap1.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF -verbose=2 -bigClusterHub=pk \
        -qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
    # real 35m7.220s
    cat fb.ce6.chainCaeJap1Link.txt
    # 26888483 bases of 100281426 (26.813%) in intersection

    # swap, this is also in caeJap1.txt
    mkdir /cluster/data/caeJap1/bed/blastz.ce6.swap
    cd /cluster/data/caeJap1/bed/blastz.ce6.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
        -bigClusterHub=pk -smallClusterHub=kki -swap > swap.log 2>&1 &
    # real 1m10.794s
    # failed to load because:
    #   loadUp: looks like previous stage was not successful
    #   (can't find /cluster/data/caeJap1/bed/blastz.ce6.swap/mafNet/).
    # continuing:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        -qRepeats=windowmaskerSdust \
        -continue=load /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
        -bigClusterHub=pk -smallClusterHub=kki -swap > load.log 2>&1 &
    # real 0m33.440s
    cat fb.caeJap1.chainCe6Link.txt
    # 26209442 bases of 129347181 (20.263%) in intersection

############################################################################
## 6-Way multiple alignment (DONE - 2008-06-09 - Hiram)
    mkdir /cluster/data/ce6/bed/multiz6way
    cd /cluster/data/ce6/bed/multiz6way

    # An additional nematode tree: http://nematol.unh.edu/tree/tree_des.php
    # Constructing a tree from the document:
    # http://www.wormbook.org/chapters/www_phylogrhabditids/phylorhab.html
    # http://www.wormbook.org/chapters/www_phylogrhabditids/phylofig6B.jpg
    # The last branch, ":0.2,pristiochus_priPac1:0.8", is purely artificial
    # since that species is not on the graph.
    cat << '_EOF_' > nematodes.nh
((((((((((briggsae_cb3:0.005,remanei_caeRem3:0.003):0.004,
brenneri_caePb2:0.013):0.002,elegans_ce6:0.003):0.001,
japonica_caeJap1:0.023):0.024,ps1010:0.027):0.014,
df5070:0.056):0.009,
plicata:0.102):0.099,sb341:0.060):0.038,
(df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8)
'_EOF_'
    # << happy emacs
    # filter that for the six species in use here
    /cluster/bin/phast/x86_64/tree_doctor \
        --prune-all-but briggsae_cb3,remanei_caeRem3,brenneri_caePb2,elegans_ce6,japonica_caeJap1,pristiochus_priPac1 \
        nematodes.nh > 6way.nh
    # rearrange to get ce6 on top for the construction of the tree image
    # in the 6-way description page.  This results in something like:
    # (((C._elegans_ce6:0.003000,
    #    (C._brenneri_caePb2:0.013000,
    #     (C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
    #     :0.002000):0.001000,
    #    C._japonica_caeJap1:0.023000):0.384000,
    #   P._pristiochus_priPac1:0.800000);
    /cluster/bin/phast/x86_64/all_dists 6way.nh > 6way.distances.txt
    grep -i ce6 6way.distances.txt | sort -k3,3n
    # Use this output for reference, and use the calculated
    # distances in the table below to order the organisms and check
    # the button order on the browser.
    # And if you can fill in the table below entirely, you have
    # succeeded in finishing all the alignments required.
    #
    #                               featureBits chainLink measures
    #                             chainCe6Link       chain   linearGap
    #    distance                  on Ce6    on other   minScore
    # 1  0.0120 - remanei_caeRem3     (% 41.736) (% 33.472)  1000  loose
    # 2  0.0140 - briggsae_cb3        (% 42.364) (% 39.823)  1000  loose
    # 3  0.0180 - brenneri_caePb2     (% 40.691) (% 32.317)  1000  loose
    # 4  0.0270 - japonica_caeJap1    (% 26.813) (% 20.263)  1000  loose
    # 5  1.1880 - pristiochus_priPac1 (%  6.239) (%  4.661)  1000  loose

    cd /cluster/data/ce6/bed/multiz6way
    # bash shell syntax here ...
    export H=/cluster/data/ce6/bed
    mkdir mafLinks
    for G in caeRem3 cb3 caePb2 priPac1 caeJap1
    do
        mkdir mafLinks/$G
        if [ ! -d ${H}/blastz.${G}/mafNet ]; then
            echo "missing directory blastz.${G}/mafNet"
            exit 255
        fi
        ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
    done

    # Copy MAFs to some appropriate NFS server for kluster run
    ssh kkstore06
    mkdir /san/sanvol1/scratch/worms/ce6/multiz6way
    cd /san/sanvol1/scratch/worms/ce6/multiz6way
    time rsync -a --copy-links --progress --stats \
        /cluster/data/ce6/bed/multiz6way/mafLinks/ .
    # a few seconds to copy 126 Mb

    # these are x86_64 binaries
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    # the autoMultiz cluster run
    ssh memk
    cd /cluster/data/ce6/bed/multiz6way/
    # create species list and stripped down tree for autoMZ
    sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
        6way.nh > tmp.nh
    echo `cat tmp.nh` > tree-commas.nh
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
    mkdir maf run
    cd run
    # NOTE: you need to set the db properly in this script
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = ce6
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/worms/$db/multiz6way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/worms/$db/multiz6way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
        continue
    endif
    if (-e $in.gz) then
        zcat $in.gz > $out
    else if (-e $in) then
        cp $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

    cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/ce6/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    awk '{print $1}' /cluster/data/ce6/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    para -maxNode=1 push
    para check ... push ... etc ...
    # Completed: 7 of 7 jobs
    # CPU time in finished jobs:  2302s  38.37m  0.64h  0.03d  0.000 y
    # IO & Wait Time:               38s   0.63m  0.01h  0.00d  0.000 y
    # Average job time:            334s   5.57m  0.09h  0.00d
    # Longest running job:           0s   0.00m  0.00h  0.00d
    # Longest finished job:        478s   7.97m  0.13h  0.01d
    # Submission to last job:      954s  15.90m  0.27h  0.01d

    ssh kkstore06
    cd /cluster/data/ce6/bed/multiz6way
    time nice -n +19 catDir maf > multiz6way.maf
    # a few seconds to produce:
    # -rw-rw-r--  1 343585668 Jun  9 16:25 multiz6way.maf
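    # Composition check on the combined maf (a sketch, not from the original
    # log): count alignment "s" lines per assembly to confirm all six
    # genomes made it into the multiple alignment.
    grep "^s " multiz6way.maf | awk '{print $2}' | sed -e 's/\..*//' \
        | sort | uniq -c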
    # Before getting to the annotation, load this up so we can get a first
    # view of the track.  This will be replaced with the annotated mafs.
    ssh hgwdev
    cd /cluster/data/ce6/bed/multiz6way
    mkdir /gbdb/ce6/multiz6way
    ln -s /cluster/data/ce6/bed/multiz6way/multiz6way.maf /gbdb/ce6/multiz6way
    # this load creates a large file, do that on local disk:
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf ce6 multiz6way
    # a few seconds to load:
    # Loaded 367046 mafs in 1 files from /gbdb/ce6/multiz6way
    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
        -maxSize=50000 ce6 multiz6waySummary /gbdb/ce6/multiz6way/multiz6way.maf
    # a few seconds to load:
    # Created 125113 summary blocks from 1051000 components
    #	and 366975 mafs from /gbdb/ce6/multiz6way/multiz6way.maf
    # remove the temporary .tab files:
    rm multiz6*.tab

############################################################################
#### Blat sangerGene proteins to determine exons (DONE - 2008-06-11 - Hiram)
    ssh hgwdev
    cd /cluster/data/ce6/bed
    mkdir blat.ce6SG.2008-06-11
    ln -s blat.ce6SG.2008-06-11 blat.ce6SG
    cd blat.ce6SG
    pepPredToFa ce6 sangerPep sangerPep.fa

    ssh kkstore06
    # create san nib directory from the ce6.2bit chromosome sequences
    mkdir /san/sanvol1/scratch/worms/ce6/nib
    cd /san/sanvol1/scratch/worms/ce6/nib
    for C in I II III IV V X M
    do
        twoBitToFa -seq=chr${C} /cluster/data/ce6/ce6.2bit stdout \
            | faToNib -softMask stdin chr${C}.nib
    done

    # The kluster run
    ssh pk
    cd /cluster/data/ce6/bed/blat.ce6SG
    cat << '_EOF_' > blastSome
#!/bin/csh -fe
blat -t=dnax -q=prot -out=pslx /san/sanvol1/scratch/worms/ce6/nib/$1.nib sgfa/$2.fa $3
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    ls -1S /san/sanvol1/scratch/worms/ce6/nib > ce6.lst
    mkdir sgfa
    cd sgfa
    faSplit sequence ../sangerPep.fa 3000 sg
    ls -1S *.fa > ../sg.lst
    cd ..
    cat << '_EOF_' > template
#LOOP
blastSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ce6.lst sg.lst template jobList
    mkdir psl
    cd psl
    sed -e "s/.nib//" ../ce6.lst | xargs mkdir
    cd ..
    para create jobList
    para try ... check ... push ... etc
    # Completed: 20335 of 20335 jobs
    # CPU time in finished jobs:  69885s  1164.74m  19.41h  0.81d  0.002 y
    # IO & Wait Time:             51640s   860.67m  14.34h  0.60d  0.002 y
    # Average job time:               6s     0.10m   0.00h  0.00d
    # Longest finished job:         348s     5.80m   0.10h  0.00d
    # Submission to last job:       943s    15.72m   0.26h  0.01d

    ssh kkstore06
    cd /cluster/data/ce6/bed/blat.ce6SG.2008-06-11
    pslSort dirs raw.psl /tmp psl/*
    # -rw-rw-r--  1 70627970 Jun 11 10:35 raw.psl
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
    # Processed 155625 alignments
    # -rw-rw-r--  1 26423327 Jun 11 10:35 cooked.psl
    pslUniq cooked.psl ce6SG.psl
    # -rw-rw-r--  1 25332830 Jun 11 10:36 ce6SG.psl
    cut -f 10 ce6SG.psl > sgName.lst
    faSomeRecords sangerPep.fa sgName.lst ce6SG.fa
    # -rw-rw-r--  1 10903181 Jun 11 10:11 sangerPep.fa
    # -rw-rw-r--  1 10899914 Jun 11 10:37 ce6SG.fa
    grep "^>" ce6SG.fa | wc -l
    # 23741
    grep "^>" sangerPep.fa | wc -l
    # 23771
    pslxToFa ce6SG.psl ce6SG_ex.fa -liftTarget=genome.lft \
        -liftQuery=protein.lft
    # -rw-rw-r--  1  5288587 Jun 11 10:39 protein.lft
    # -rw-rw-r--  1  6741328 Jun 11 10:39 genome.lft
    # -rw-rw-r--  1 12743074 Jun 11 10:39 ce6SG_ex.fa
    wc -l *.psl *.lft *.fa sgName.lst
    #   23741 ce6SG.psl
    #   25926 cooked.psl
    #  155630 raw.psl
    #  151196 genome.lft
    #  151196 protein.lft
    #  244239 ce6SG.fa
    #  302392 ce6SG_ex.fa
    #  244343 sangerPep.fa
    #   23741 sgName.lst
    # 1322404 total

    # back on hgwdev
    ssh hgwdev
    cd /cluster/data/ce6/bed/blat.ce6SG
    # After about an hour, it exited with this message:
    #   sqlFreeConnection called on cache (ce6) that doesn't contain
    #   the given connection
    # This may be a lurking error in this program, because the
    # resulting file seems to have the correct number of lines.
    # Once upon a time, long long ago, this table was made by the kgName
    # command, but this doesn't work on C. elegans because there is no
    # kgXref table.  So, the following perl script was developed to do the
    # same thing:
    # kgName ce6 ce6SG.psl blastSGRef01
    # Need a reference table loaded; the perl script makes up the relationships
    cat << '_EOF_' >> mkRef.pl
#!/usr/bin/env perl

use warnings;
use strict;

my $argc = scalar(@ARGV);

sub usage() {
    print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n";
    print STDERR "reads ../sangerGene/genePred.tab and ",
        "../sangerGene/orfToGene.tab\n";
    print STDERR "writes references to the given input file, ",
        "e.g. blastSGRef01.tab\n";
}

if (1 != $argc) { usage; exit 255; }

my $outFile = shift;

my %orfToGene;	# key is all lowercase orf name, value is gene name
my $inFile="../sangerGene/orfToGene.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $gene) = split('\s+',$line);
    $orfToGene{lc($orf)} = $gene;
}
close (FH);

my %links;	# key is all lowercase orf name, value is SwissProt name
$inFile="../sangerGene/sangerLinks.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($orf, $SP, $comments) = split('\s+',$line);
    $links{lc($orf)} = $SP;
}
close (FH);

my %ce6Position;	# key is all lowercase orf name, value is ce6 position
$inFile="../sangerGene/genePred.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
    chomp $line;
    my ($bin, $orf, $chr, $strand, $start, $end, $other) = split('\s+',$line);
    $ce6Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end);
}
close (FH);

open (OF,">$outFile") or die "can not write to $outFile";
$inFile="sgName.lst";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $orf=<FH>) {
    chomp $orf;
    if (!exists($orfToGene{lc($orf)})) { $orfToGene{lc($orf)} = ""; }
    if (!exists($links{lc($orf)})) { $links{lc($orf)} = ""; }
    if (!exists($ce6Position{lc($orf)})) { $ce6Position{lc($orf)} = ""; }
    printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)},
        $ce6Position{lc($orf)}, $links{lc($orf)}, "";
}
close (FH);
close (OF);
'_EOF_'
    # << happy emacs
    chmod +x ./mkRef.pl
    ./mkRef.pl blastSGRef01.tab
    wc -l blastSGRef01.tab
    # 23741 blastSGRef01.tab
    hgLoadSqlTab ce6 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \
        blastSGRef01.tab
    wc -l sgName.lst blastSGRef01.tab ce6SG.psl
    # 23741 sgName.lst
    # 23741 blastSGRef01.tab
    # 23741 ce6SG.psl
    # And load the protein sequences
    hgPepPred ce6 generic blastSGPep01 ce6SG.fa

############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS - 6-WAY
#	(DONE - 2008-06-18 - Hiram)
    # Estimate phastCons parameters
    ssh kkstore06
    # We need the fasta sequence for this business
    cd /cluster/data/ce6
    for C in `cut -f1 chrom.sizes | sed -e s/chr//`
    do
        echo "twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa"
        twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa
    done

    mkdir /cluster/data/ce6/bed/multiz6way/cons
    cd /cluster/data/ce6/bed/multiz6way/cons
    # Create a starting-tree.mod based on chrV (the largest one)
    # Set -windows large enough so it only makes one piece
    faSize ../../../V/chrV.fa
    # 20919568 bases (0 N's 20919568 real 18018198 upper 2901370 lower)
    #	in 1 sequences in 1 files
    # %13.87 masked total, %13.87 masked real
    time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \
        --refseq ../../../V/chrV.fa --in-format MAF \
        --windows 30000000,1000 --out-format SS \
        --between-blocks 5000 --out-root s1
    # Creating partition 1 (column 1 to column 20919568)...
    # Writing partition 1 to s1.1-20919568.ss...
    # 13 seconds

    # the tree in use
    cat ../tree-commas.nh
    # (((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)

    # this s1.*.ss should match only one item
    time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
        --tree "(((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)" \
        --out-root starting-tree
    # real 0m14.408s
    rm s1.*.ss

    # add up the C and G:
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    # 0.383
    # This 0.383 is used in the --gc argument below

    # Create SS files on san filesystem
    # Increasing their size this time from 1,000,000 to 4,000,000 to
    # slow down the phastCons pk jobs
    ssh kkstore06
    mkdir -p /san/sanvol1/scratch/worms/ce6/cons/ss
    cd /san/sanvol1/scratch/worms/ce6/cons/ss
    time for C in `cut -f1 /cluster/data/ce6/chrom.sizes`
    do
        if [ -s /cluster/data/ce6/bed/multiz6way/maf/${C}.maf ]; then
            mkdir ${C}
            echo msa_split $C
            chrN=${C/chr/}
            chrN=${chrN/_random/}
            /cluster/bin/phast/$MACHTYPE/msa_split \
                /cluster/data/ce6/bed/multiz6way/maf/${C}.maf \
                --refseq /cluster/data/ce6/${chrN}/${C}.fa \
                --in-format MAF --windows 4000000,0 --between-blocks 5000 \
                --out-format SS --out-root ${C}/${C}
        fi
    done
    # real 1m7.685s

    # Create a random list of 50 1 mb regions (do not use the _randoms)
    cd /san/sanvol1/scratch/worms/ce6/cons/ss
    ls -1l chr*/chr*.ss | grep -v random | \
        awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
    # Only 28 lines in stdin (i.e. all are used)

    # Set up parasol directory to calculate trees on these regions
    ssh pk
    mkdir /san/sanvol1/scratch/worms/ce6/cons/treeRun2
    cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
    mkdir tree log

    # Tuning this loop should come back to here to recalculate
    # Create script that calls phastCons with right arguments
    # expected-lengths and target-coverage taken from ce4 history
    # for treeRun2:  --expected-lengths 15 --target-coverage 0.55
    # for treeRun1:  --expected-lengths 15 --target-coverage 0.60
    cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
    /cluster/data/ce6/bed/multiz6way/cons/starting-tree.mod \
    --gc 0.383 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-lengths 15 --target-coverage 0.55 \
    --quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
    # << happy emacs
    chmod +x makeTree.csh

    cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Make cluster job and run it
    gensub2 ../randomSs.list single template jobList
    para create jobList
    para try/push/check/etc
    # Completed: 28 of 28 jobs
    # CPU time in finished jobs:  5941s  99.01m  1.65h  0.07d  0.000 y
    # IO & Wait Time:               64s   1.07m  0.02h  0.00d  0.000 y
    # Average job time:            214s   3.57m  0.06h  0.00d
    # Longest finished job:        309s   5.15m  0.09h  0.00d
    # Submission to last job:      326s   5.43m  0.09h  0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models.
    ssh kkstore06
    cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
    ls -1 tree/chr*/*.cons.mod > cons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
        --output-average ../ave.cons.mod > cons_summary.txt 2>&1
    ls -1 tree/chr*/*.noncons.mod > noncons.list
    /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
        --output-average ../ave.noncons.mod > noncons_summary.txt
    cd ..
    cp -p ave.*.mod /cluster/data/ce6/bed/multiz6way/cons

    # measuring entropy:
    #   consEntropy <target coverage> <expected lengths> \
    #       ave.cons.mod ave.noncons.mod --NH 9.78
    /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .55 15 \
        ave.cons.mod ave.noncons.mod
    ### for treeRun2:  --expected-lengths 15 --target-coverage 0.55
    # ( Solving for new omega: 15.000000 76.253180 42.469155 35.389619
    #	34.810297 34.805819 )
    # Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667,
    #	nu=0.081481
    # Relative entropy: H=1.172307 bits/site
    # Expected min. length: L_min=6.108564 sites
    # Expected max. length: L_max=5.516260 sites
    # Phylogenetic information threshold: PIT=L_min*H=7.161112 bits
    # Recommended expected length: omega=34.805819 sites (for L_min*H=9.780000)

    ### for treeRun1:  --expected-lengths 15 --target-coverage 0.60
    # ( Solving for new omega: 15.000000 140.706073 61.140093 42.355284
    #	39.561113 39.474822 )
    # Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667,
    #	nu=0.100000
    # Relative entropy: H=1.248356 bits/site
    # Expected min. length: L_min=5.363742 sites
    # Expected max. length: L_max=5.097039 sites
    # Phylogenetic information threshold: PIT=L_min*H=6.695858 bits
    # Recommended expected length: omega=39.474822 sites (for L_min*H=9.780000)

    ssh pk
    # Create cluster dir to do main phastCons run
    mkdir /san/sanvol1/scratch/worms/ce6/cons/consRun2
    cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
    mkdir ppRaw bed
    # using the ave.cons.mod and ave.noncons.mod from treeRun2 above:
    #
    # ave.cons.mod:
    # ALPHABET: A C G T
    # ORDER: 0
    # SUBST_MOD: REV
    # BACKGROUND: 0.308500 0.191500 0.191500 0.308500
    # RATE_MAT:
    #   -0.875904    0.175256    0.458101    0.242547
    #    0.282331   -1.200805    0.180928    0.737546
    #    0.737985    0.180928   -1.200354    0.281440
    #    0.242547    0.457829    0.174703   -0.875079
    # TREE: (((((cb3:0.087550,caeRem3:0.073515):0.020860,caePb2:0.090766):0.029808,ce6:0.094770):0.038388,caeJap1:0.109779):0.126548,priPac1:0.126548);
    #
    # ave.noncons.mod:
    # ALPHABET: A C G T
    # ORDER: 0
    # SUBST_MOD: REV
    # BACKGROUND: 0.308500 0.191500 0.191500 0.308500
    # RATE_MAT:
    #   -0.875904    0.175256    0.458101    0.242547
    #    0.282331   -1.200805    0.180928    0.737546
    #    0.737985    0.180928   -1.200354    0.281440
    #    0.242547    0.457829    0.174703   -0.875079
    # TREE: (((((cb3:0.328951,caeRem3:0.273909):0.078262,caePb2:0.339349):0.111927,ce6:0.355503):0.144928,caeJap1:0.413391):0.478831,priPac1:0.478831);

    # Create script to run phastCons with right parameters
    # This job is I/O intensive in its output files, thus it is all
    # working over in /scratch/tmp/
    cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
    --expected-lengths 15 --target-coverage 0.55 --quiet \
    --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
    # << happy emacs
    chmod +x doPhast

    # root1 == chrom name, file1 == ss file name without .ss suffix
    cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch and run it
    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
    gensub2 in.list single template jobList
    para create jobList
    para try/check/push/etc.
# These jobs are very fast and very I/O intensive # Completed: 29 of 29 jobs # CPU time in finished jobs: 386s 6.43m 0.11h 0.00d 0.000 y # IO & Wait Time: 85s 1.42m 0.02h 0.00d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 49s 0.82m 0.01h 0.00d # combine predictions and transform scores to be in 0-1000 interval # it uses a lot of memory, so on kolossus: ssh kolossus cd /san/sanvol1/scratch/worms/ce6/cons/consRun2 # The sed's and the sort get the file names in chrom,start order # You might like to verify it is correct by first looking at the # list it produces: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" # if that looks right (== in chrom and numerical order), then let it run: find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ | /cluster/bin/scripts/lodToBedScore /dev/stdin \ > phastConsElements6way.bed # way too many negative # scores with --expected-length 15 --target-coverage 0.60 # OK with --expected-length 15 --target-coverage 0.55 cp -p phastConsElements6way.bed /cluster/data/ce6/bed/multiz6way # Figure out how much is actually covered by the bed file like so: # Get the non-n genome size from faSize on all chroms: ssh kkstore06 cd /cluster/data/ce6 faSize */chr*.fa # 100281426 bases (0 N's 100281426 real 86981803 upper # 13299623 lower) in 7 sequences in 7 files # %13.26 masked total, %13.26 masked real # 100281244 bases (0 N's 100281244 real 87008909 upper # 13272335 lower) in 7 sequences in 7 files # %13.24 masked total, %13.24 masked real cd /san/sanvol1/scratch/worms/ce6/cons/consRun2 # The 100281426 comes from the non-n genome as counted above. awk ' {sum+=$3-$2} END{printf "%% %.2f = 100.0*%d/100281426\n",100.0*sum/100281426,sum}' \ phastConsElements6way.bed # --expected-length 15 --target-coverage 0.55 # % 29.56 = 100.0*29639749/100281426 # Aiming for 70% coverage in # the following featureBits measurement on CDS: # Beware of negative scores when target-coverage is set too high. The # lodToBedScore will output an error on any negative scores. HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce6 \ -enrichment sangerGene:cds phastConsElements6way.bed # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.329%, phastConsElements6way.bed 29.557%, # both 15.469%, cover 61.07%, enrich 2.07x # in Ce4, this was: # sangerGene:cds 25.286%, phastConsElements5way.bed 32.368%, # both 15.733%, cover 62.22%, enrich 1.92x # Load most conserved track into database ssh hgwdev cd /cluster/data/ce6/bed/multiz6way # the copy was already done above # cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastConsElements6way.bed .
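# Optional check before loading (a sketch): confirm the transformed scores
# fall in the usual BED score range, per the negative-score warning above.
awk '$5 < 0 || $5 > 1000' phastConsElements6way.bed | wc -l
#	expect 0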
time nice -n +19 hgLoadBed ce6 phastConsElements6way \ phastConsElements6way.bed # Loaded 595094 elements of size 5 # should measure the same as above time nice -n +19 featureBits ce6 -enrichment sangerGene:cds \ phastConsElements6way # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.329%, phastConsElements6way 29.557%, both 15.469%, # cover 61.07%, enrich 2.07x # on Ce4, this was: time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \ phastConsElements5way # sangerGene:cds 25.286%, phastConsElements5way 32.368%, both 15.733%, # cover 62.22%, enrich 1.92x # Create merged posterior probability file and wiggle track data files ssh kkstore06 cd /san/sanvol1/scratch/worms/ce6/cons/consRun2 # prepare compressed copy of ascii data values for downloads ssh pk cd /san/sanvol1/scratch/worms/ce6/cons/consRun2 cat << '_EOF_' > gzipAscii.sh #!/bin/sh TOP=`pwd` export TOP mkdir -p phastCons6wayScores for D in ppRaw/chr* do C=${D/ppRaw\/} out=phastCons6wayScores/${C}.data.gz echo "========================== ${C} ${D}" find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \ | sort -k7,7 -k9,9n \ | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat | gzip > ${out} done '_EOF_' # << happy emacs chmod +x gzipAscii.sh time nice -n +19 ./gzipAscii.sh # real 1m14.178s # use those files to encode the data for the track: for C in I II III IV V X M do zcat phastCons6wayScores/chr${C}.data.gz done | wigEncode stdin phastCons6way.wig phastCons6way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # -rw-rw-r-- 1 56813758 Jun 18 12:05 phastCons6way.wib # -rw-rw-r-- 1 8880383 Jun 18 12:05 phastCons6way.wig nice -n +19 cp -p phastCons6way.wi? /cluster/data/ce6/bed/multiz6way/ # copy them for downloads ssh kkstore06 mkdir /cluster/data/ce6/bed/multiz6way/phastCons6wayScores cd /cluster/data/ce6/bed/multiz6way/phastCons6wayScores cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastCons6wayScores/* . ssh hgwdev cd /usr/local/apache/htdocs/goldenPath/ce6 ln -s /cluster/data/ce6/bed/multiz6way/phastCons6wayScores . # Load gbdb and database with wiggle.
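# Optional pre-load check (a sketch; the count of 7 assumes one score file
# per chromosome: chrI-chrV, chrX and chrM):
ls -og /cluster/data/ce6/bed/multiz6way/phastCons6way.wi?
ls /cluster/data/ce6/bed/multiz6way/phastCons6wayScores/*.data.gz | wc -l
#	expect 7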
ssh hgwdev cd /cluster/data/ce6/bed/multiz6way ln -s `pwd`/phastCons6way.wib /gbdb/ce6/wib/phastCons6way.wib nice -n +19 hgLoadWiggle ce6 phastCons6way phastCons6way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /cluster/data/ce6/bed/multiz6way nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=ce6 phastCons6way > histogram.data 2>&1 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small color \ x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000 set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Worm Ce6 Histogram phastCons6way track" set xlabel " phastCons6way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & # enable this line in the ce6/trackDb.ra multiz6way track definition: # wiggle phastCons6way # to get the graph to display in the genome browser ############################################################################ # ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2008-06-19 - Hiram) ssh kolossus mkdir /cluster/data/ce6/bed/multiz6way/anno cd /cluster/data/ce6/bed/multiz6way/anno # these actually already all exist from previous multiple alignments # if they are already done you will see an ls of them # or else the twoBitInfo command will be echoed, run it if you want for DB in `cat ../species.lst` do CDIR="/cluster/data/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done mkdir maf run cd run rm -f sizes nBeds for DB in `sed -e "s/ce6 //" ../../species.lst` do echo "${DB} " ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done cat << '_EOF_' > doAnno.csh #!/bin/csh -ef set dir = /cluster/data/ce6/bed/multiz6way set c = $1 cat $dir/maf/${c}.maf | nice \ mafAddIRows -nBeds=nBeds stdin /cluster/data/ce6/ce6.2bit $2 '_EOF_' # << happy emacs chmod +x doAnno.csh cat << '_EOF_' > template #LOOP ./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/ce6/chrom.sizes > chrom.list gensub2 chrom.list single template jobList para create jobList para try/check/push/etc. # Completed: 7 of 7 jobs # CPU time in finished jobs: 217s 3.61m 0.06h 0.00d 0.000 y # IO & Wait Time: 60s 1.01m 0.02h 0.00d 0.000 y # Average job time: 40s 0.66m 0.01h 0.00d # Longest finished job: 55s 0.92m 0.02h 0.00d # Submission to last job: 58s 0.97m 0.02h 0.00d # Load anno/maf ssh hgwdev cd /cluster/data/ce6/bed/multiz6way/anno/maf mkdir -p /gbdb/ce6/multiz6way/anno/maf ln -s /cluster/data/ce6/bed/multiz6way/anno/maf/*.maf \ /gbdb/ce6/multiz6way/anno/maf time nice -n +19 /cluster/bin/x86_64/hgLoadMaf \ -pathPrefix=/gbdb/ce6/multiz6way/anno/maf ce6 multiz6way # XXX problem: Can't hSetDb(ce6) after an hAllocConn(hg18), sorry.
# Loaded 420764 mafs in 7 files from /gbdb/ce6/multiz6way/anno/maf # real 0m9.567s # Do the computation-intensive part of hgLoadMafSummary on a workhorse # machine and then load on hgwdev: ssh kolossus cd /cluster/data/ce6/bed/multiz6way/anno/maf time cat *.maf | \ nice -n +19 hgLoadMafSummary ce6 -minSize=30000 -mergeGap=1500 \ -maxSize=200000 -test multiz6waySummary stdin # Created 45771 summary blocks from 1051000 components # and 420693 mafs from stdin # real 0m16.138s # -rw-rw-r-- 1 2111948 Jun 20 10:25 multiz6waySummary.tab ssh hgwdev cd /cluster/data/ce6/bed/multiz6way/anno/maf time nice -n +19 hgLoadSqlTab ce6 multiz6waySummary \ ~/kent/src/hg/lib/mafSummary.sql multiz6waySummary.tab # real 0m0.645 # Create per-chrom individual maf files for downloads ssh kkstore02 cd /cluster/data/ce6/bed/multiz6way mkdir mafDownloads time for M in anno/maf/chr*.maf do B=`basename $M` nice -n +19 cp -p ${M} mafDownloads/${B} nice -n +19 gzip mafDownloads/${B} echo ${B} done done ssh hgwdev cd /cluster/data/ce6/bed/multiz6way # redone 2007-12-21 to fix difficulty in mafFrags when ce6 was not the # first species listed # upstream mafs # There isn't any refGene table, trying sangerGene instead #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 ce6 \ sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ce6 multiz6way \ stdin stdout -orgs=species.lst \ | gzip -c > mafDownloads/sangerGene.upstream${S}.maf.gz echo "done sangerGene.upstream${S}.maf.gz" done cd mafDownloads md5sum *.gz > md5sum.txt # deliver to downloads ssh hgwdev mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way cd /usr/local/apache/htdocs/goldenPath/ce6/multiz6way ln -s /cluster/data/ce6/bed/multiz6way/mafDownloads/* . 
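# Optional verification of the delivered files (a sketch, using the
# md5sum.txt generated above in mafDownloads):
md5sum -c md5sum.txt
#	every chr*.maf.gz and sangerGene.upstream*.maf.gz should report OK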
####################################################################### # MULTIZ6WAY MAF FRAMES (DONE - 2007-05-03 - Hiram) ssh hgwdev mkdir /cluster/data/ce6/bed/multiz6way/frames cd /cluster/data/ce6/bed/multiz6way/frames mkdir genes # The following is adapted from the ce4 5-way procedure #------------------------------------------------------------------------ # get the genes for all genomes # using sangerGene for ce6 # using blastCe6SG for cb3, caePb2, caeRem3, caeJap1, priPac1 for qDB in ce6 cb3 caePb2 caeRem3 caeJap1 priPac1 do if [ $qDB = "cb3" -o $qDB = "caePb2" -o $qDB = "caeRem3" -o $qDB = "caeJap1" -o $qDB = "priPac1" ]; then geneTbl=blastCe6SG echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ > /scratch/tmp/$qDB.tmp.psl mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm -f /scratch/tmp/$qDB.tmp.psl mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz elif [ $qDB = "ce6" ]; then geneTbl=sangerGene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz fi done ssh kkstore06 cd /cluster/data/ce6/bed/multiz6way/frames cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce6 \ stdin stdout ce6 genes/ce6.gp.gz cb3 \ genes/cb3.gp.gz caePb2 genes/caePb2.gp.gz caeRem3 \ genes/caeRem3.gp.gz caeJap1 genes/caeJap1.gp.gz \ priPac1 genes/priPac1.gp.gz \ | gzip > multiz6way.mafFrames.gz ssh hgwdev cd /cluster/data/ce6/bed/multiz6way/frames nice -n +19 hgLoadMafFrames ce6 multiz6wayFrames multiz6way.mafFrames.gz ########################################################################## # verify all.joiner entries (DONE - 2008-06-20 - Hiram) # try to get joinerCheck tests clean output on these commands ssh hgwdev cd ~/kent/src/hg/makeDb/schema joinerCheck -database=ce6 -tableCoverage all.joiner joinerCheck -database=ce6 -keys all.joiner joinerCheck -database=ce6 -times all.joiner ########################################################################## # make downloads (DONE - 2008-06-20 - Hiram) # verify all tracks have html pages for their trackDb entries cd /cluster/data/ce6 /cluster/bin/scripts/makeDownloads.pl ce6 # fix the README files # re-build the upstream files: cd /cluster/data/ce6/goldenPath/bigZips for size in 1000 2000 5000 do echo $size featureBits ce6 sangerGene:upstream:$size -fa=stdout \ | gzip -c > sangerGene.upstream$size.fa.gz done ########################################################################## # create pushQ entry (DONE - 2008-06-20 - Hiram) # verify all tracks have html pages for their trackDb entries ssh hgwdev cd /cluster/data/ce6/jkStuff /cluster/bin/scripts/makePushQSql.pl ce6 > ce6.pushQ.sql # it reported these warnings: # ce6 does not have seq # Could not tell (from trackDb, all.joiner and hardcoded lists of supporting # and genbank tables) which tracks to assign these tables to: # blastSGPep01 blastSGRef01 multiz6wayFrames ############################################################################# # Create 6-way downloads (DONE - 2008-06-23 - Hiram) ssh hgwdev mkdir /cluster/data/ce6/goldenPath/multiz6way mkdir /cluster/data/ce6/goldenPath/phastCons6way cd /cluster/data/ce6/goldenPath/phastCons6way ln -s ../../bed/multiz6way/phastCons6wayScores/*.gz .
ln -s ../../bed/multiz6way/cons/ave.cons.mod 6way.conserved.mod ln -s ../../bed/multiz6way/cons/ave.noncons.mod 6way.non-conserved.mod md5sum *.gz *.mod > md5sum.txt # copy a README and fix it up for here: cp /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way/README.txt . mkdir /cluster/data/ce6/goldenPath/multiz6way cd /cluster/data/ce6/goldenPath/multiz6way cp /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way/README.txt . # edit that README.txt to be correct for this 6-way alignment ssh kkstore06 cd /cluster/data/ce6/goldenPath/multiz6way ln -s ../../bed/multiz6way/6way.nh ./ce6.6way.nh cp -p ../../bed/multiz6way/anno/maf/*.maf . gzip *.maf ssh hgwdev cd /cluster/data/ce6/goldenPath/multiz6way # creating upstream files from sangerGene, bash script: cat << '_EOF_' > mkUpstream.sh #!/bin/bash DB=ce6 GENE=sangerGene NWAY=multiz6way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \ stdin stdout \ -orgs=/cluster/data/${DB}/bed/${NWAY}/species.lst \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done '_EOF_' # << happy emacs chmod +x ./mkUpstream.sh time nice -n +19 ./mkUpstream.sh # real 119m5.562s # -rw-rw-r-- 1 42975041 Mar 28 14:27 upstream1000.maf.gz # -rw-rw-r-- 1 76363192 Mar 28 15:03 upstream2000.maf.gz # -rw-rw-r-- 1 303870318 Mar 28 15:42 upstream5000.maf.gz # check the names in these upstream files to ensure sanity: zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \ | sort | uniq -c | sort -rn | less # should be a list of the other 5 species with a high count, # then sangerGene names, e.g.: # 10134 priPac1 # 10134 cb3 # 10134 caeRem3 # 10134 caePb2 # 10134 caeJap1 # 1 cTel55X.1a.1 # 1 ZK993.1 # 1 ZK973.9 # 1 ZK973.8 # 1 ZK973.3 # ... etc ...
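# Optional extension of the same sanity check (a sketch) to all three
# upstream sizes; each of the 5 query species should appear with one
# identical count within a file:
for S in 1000 2000 5000
do
    echo "upstream${S}.maf.gz:"
    zcat upstream${S}.maf.gz | grep "^s " | awk '{print $2}' \
        | sort | uniq -c | sort -rn | head -6
done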
ssh kkstore06 cd /cluster/data/ce6/goldenPath/multiz6way md5sum *.gz *.nh > md5sum.txt ssh hgwdev cd /cluster/data/ce6/goldenPath/phastCons6way mkdir /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way cd /cluster/data/ce6/goldenPath/multiz6way mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/multiz6way # if your ln -s `pwd`/* made extra links to files you don't want there, # check the goldenPath locations and remove those extra links ########################################################################## # Make protein blast runs for Gene Sorter and Known Genes on Hg18 ssh hgwdev mkdir /cluster/data/ce6/bed/hgNearBlastp cd /cluster/data/ce6/bed/hgNearBlastp # Make sure there are no joinerCheck errors at this point, because # if left here they can spread via doHgNearBlastp: runJoiner.csh ce6 sangerPep # ce6.kimExpDistance.query - hits 13356000 of 19134000 ok # ce6.kimExpDistance.target - hits 12973822 of 19134000 ok # ce6.orfToGene.name - hits 30296 of 30296 ok # ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok # ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok # ce6.sangerCanonical.transcript - hits 20051 of 20051 ok # ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok # ce6.sangerLinks.orfName - hits 30115 of 30115 ok # ce6.sangerPep.name - hits 23771 of 23771 ok # ce6.sangerToRefSeq.name - hits 29995 of 29995 ok # ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok pepPredToFa ce6 sangerPep ce6.sangerPep.faa cat << '_EOF_' > config.ra # Latest human vs. this ce6 worm for Human Gene Sorter targetGenesetPrefix known targetDb hg18 queryDbs ce6 hg18Fa /cluster/data/hg18/bed/ucsc.10/ucscGenes.faa ce6Fa /cluster/data/ce6/bed/hgNearBlastp/ce6.sangerPep.faa buildDir /cluster/data/ce6/bed/hgNearBlastp scratchDir /cluster/data/ce6/bed/hgNearBlastp/tmpHgNearBlastp '_EOF_' time nice -n +19 /cluster/bin/scripts/doHgNearBlastp.pl -workhorse=hgwdev \ -noLoad -verbose=2 config.ra > do.log 2>&1 & ########################################################################## ## creating xxBlastTab tables (DONE - 2008-07-11 - Hiram) # Make sure there are no joinerCheck errors at this point, because # if left here they can spread via doHgNearBlastp: ssh hgwdev runJoiner.csh ce6 sangerPep # ce6.kimExpDistance.query - hits 13356000 of 19134000 ok # ce6.kimExpDistance.target - hits 12973822 of 19134000 ok # ce6.orfToGene.name - hits 30296 of 30296 ok # ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok # ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok # ce6.sangerCanonical.transcript - hits 20051 of 20051 ok # ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok # ce6.sangerLinks.orfName - hits 30115 of 30115 ok # ce6.sangerPep.name - hits 23771 of 23771 ok # ce6.sangerToRefSeq.name - hits 29995 of 29995 ok # ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok ssh hgwdev mkdir /cluster/data/ce6/bed/hgNearBlastp/080711 cd /cluster/data/ce6/bed/hgNearBlastp/080711 # Get the proteins used by the other hgNear organisms: pepPredToFa hg18 knownGenePep hg18.known.faa pepPredToFa mm9 knownGenePep mm9.known.faa pepPredToFa rn4 knownGenePep rn4.known.faa pepPredToFa danRer5 ensPep danRer5.ensPep.faa pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa pepPredToFa ce6 sangerPep ce6.sangerPep.faa pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa # sanity check, number of lines in each faa file wc -l *.faa # 244343 ce6.sangerPep.faa # 345537 danRer5.ensPep.faa 
# 268201 dm3.flyBasePep.faa # 526563 hg18.known.faa # 453565 mm9.known.faa # 86220 rn4.known.faa # 64956 sacCer1.sgdPep.faa # sanity check, count the number of protein sequences in each for F in *.faa do echo -n "${F} " grep "^>" ${F} | wc -l done # ce6.sangerPep.faa 23771 # danRer5.ensPep.faa 31743 # dm3.flyBasePep.faa 20736 # hg18.known.faa 45480 # mm9.known.faa 39489 # rn4.known.faa 8160 # sacCer1.sgdPep.faa 5766 cd /cluster/data/ce6/bed/hgNearBlastp cat << '_EOF_' > config.ra # Latest worm vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, fly, yeast targetGenesetPrefix sanger targetDb ce6 queryDbs hg18 mm9 rn4 danRer5 dm3 sacCer1 recipBest hg18 mm9 rn4 danRer5 dm3 sacCer1 ce6Fa /cluster/data/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa hg18Fa /cluster/data/ce6/bed/hgNearBlastp/080711/hg18.known.faa mm9Fa /cluster/data/ce6/bed/hgNearBlastp/080711/mm9.known.faa rn4Fa /cluster/data/ce6/bed/hgNearBlastp/080711/rn4.known.faa danRer5Fa /cluster/data/ce6/bed/hgNearBlastp/080711/danRer5.ensPep.faa dm3Fa /cluster/data/ce6/bed/hgNearBlastp/080711/dm3.flyBasePep.faa sacCer1Fa /cluster/data/ce6/bed/hgNearBlastp/080711/sacCer1.sgdPep.faa buildDir /cluster/data/ce6/bed/hgNearBlastp/080711 scratchDir /san/sanvol1/scratch/ce6HgNearBlastp '_EOF_' # << happy emacs # Run this in a screen since it is going to take awhile # Run with -noLoad so we can eyeball files. # Later overload other databases' dmBlastTab tables. ssh kkstore06 screen cd time nice -n +19 doHgNearBlastp.pl -workhorse=hgwdev -clusterHub=pk \ -noLoad config.ra > do.log 2>&1 & # watch progress in do.log # load the blast tabs into ce6 ssh hgwdev cd /cluster/data/ce6/bed/hgNearBlastp/080711 for D in danRer5 dm3 hg18 mm9 rn4 sacCer1 do cd run.ce6.$D grep hgLoadBlastTab loadPairwise.csh ./loadPairwise.csh cd .. 
done ############################################################################# # Lift nucleosome data from ce4 to ce6 (DONE - 2008-09-28,30 - Hiram) mkdir /hive/data/genomes/ce6/bed/monoNucleosomes cd /hive/data/genomes/ce6/bed/monoNucleosomes ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.paired.bed.gz \ ce4.nucleosomeControl.fwd.bed.gz ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.paired.bed.gz \ ce4.nucleosomeControl.rev.bed.gz ln -s \ /usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \ ce4ToCe6.over.chain.gz # The name field isn't being parsed correctly because of the blanks # take out the blanks so liftOver can read them, put them back # in after lift zcat ce4.nucleosomeControl.fwd.bed.gz | sed -e "s/n = /n=/" \ | liftOver stdin ce4ToCe6.over.chain.gz stdout \ ce6.nucleosomeControl.fwd.unMapped \ | sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.fwd.bed.gz # only 5 items do not map correctly zcat ce4.nucleosomeControl.rev.bed.gz | sed -e "s/n = /n=/" \ | liftOver stdin ce4ToCe6.over.chain.gz stdout \ ce6.nucleosomeControl.rev.unMapped \ | sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.rev.bed.gz # 71 items do not map correctly LOAD="hgLoadBed -tab -noNameIx ce6" zcat ce6.nucleosomeControl.fwd.bed.gz ce6.nucleosomeControl.rev.bed.gz \ | ${LOAD} nucleosomeControl stdin # Loaded 28951421 elements of size 12 # Collapsed nucleosome track files by chromosome: ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.clean.bed.gz \ ./ce4.nucleosomeFragmentsSense.bed.gz ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.clean.bed.gz \ ./ce4.nucleosomeFragmentsAntisense.bed.gz zcat ce4.nucleosomeFragmentsSense.bed.gz | sed -e "s/n = /n=/" \ | liftOver stdin ce4ToCe6.over.chain.gz stdout \ ce6.nucleosomeControlFragmentsSense.unMapped \ | sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsSense.bed.gz # four items do not map zcat ce4.nucleosomeFragmentsAntisense.bed.gz | sed -e "s/n = /n=/" \ | liftOver stdin ce4ToCe6.over.chain.gz stdout \ ce6.nucleosomeControlFragmentsAntisense.unMapped \ | sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsAntisense.bed.gz # 34 items do not map zcat ce6.nucleosomeFragmentsSense.bed.gz \ | ${LOAD} nucleosomeFragmentsSense stdin # Loaded 11279598 elements of size 12 zcat ce6.nucleosomeFragmentsAntisense.bed.gz \ | ${LOAD} nucleosomeFragmentsAntisense stdin # Loaded 11269358 elements of size 12 # these three tracks grouped into the composite: "nucleosome" # labels: # composite container: Nucleosome predictions from SOLiD core alignments # Control - : mononucleosome control # fragments + : mononucleosomal fragments, sense strand reads # fragments - : mononucleosomal fragments, antisense strand reads # Positioning stringency wig track ln -s \ ../../../ce4/bed/monoNucleosomes/chrI-M_positioning_stringency.wig.gz \ ./ce4.nucleosomeStringency.wigVariable.txt.gz zcat ce4.nucleosomeStringency.wigVariable.txt.gz \ | varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \ stdout ce6.nucleosomeStringency.unMapped \ | gzip > ce6.nucleosomeStringency.bedGraph.gz # 11 items do not lift cat << '_EOF_' > bedGraphToVarStep.sh #!/bin/sh # optional arguments: SRC0 SRC R -- the defaults reproduce the stringency # conversion; the adjusted coverage run below passes all three explicitly SPAN=1 SRC0=${1:-ce4.nucleosomeStringency.wigVariable.txt.gz} SRC=${2:-ce6.nucleosomeStringency.bedGraph.gz} R=${3:-ce6.nucleosomeStringency.wigVariable.txt} rm -f ${R} zcat ${SRC0} | egrep "^browser|^track" > ${R} zcat ${SRC} | grep "^chr" | cut -f1 | sort -u > chr.list cat chr.list | while read C do echo "variableStep chrom=${C} span=${SPAN}" >> ${R} zcat "${SRC}" \ | awk '{if (match($1,"^'"${C}"'$")) { print } }' |
sort -k2n | awk ' { printf "%d\t%g\n", $2+1, $4 } ' >> ${R} done rm -f chr.list '_EOF_' # << happy emacs time ./bedGraphToVarStep.sh # real 35m15.201s gzip ce6.nucleosomeStringency.wigVariable.txt zcat ce6.nucleosomeStringency.wigVariable.txt.gz \ | wigEncode stdin nucleosomeStringency.wig nucleosomeStringency.wib # Converted stdin, upper limit 1.00, lower limit 0.00 ln -s `pwd`/nucleosomeStringency.wib /gbdb/ce6/wib # Positioning stringency wig track hgLoadWiggle ce6 nucleosomeStringency nucleosomeStringency.wig # NSome Stringency # Nucleosome Positioning Stringency from SOLiD core alignments # Adjusted nucleosome coverage wig track: ln -s \ ../../../ce4/bed/monoNucleosomes/adjusted_coverage_chrI-M.wig.gz \ ./ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz zcat ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \ | varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \ stdout ce6.nucleosomeAdjustedCoverage.unMapped \ | gzip > ce6.nucleosomeAdjustedCoverage.bedGraph.gz time ./bedGraphToVarStep.sh \ ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \ ce6.nucleosomeAdjustedCoverage.bedGraph.gz \ ce6.nucleosomeAdjustedCoverage.wigVariable.txt # real 27m40.408s time gzip ce6.nucleosomeAdjustedCoverage.wigVariable.txt # real 3m27.146s time wigEncode ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz \ nucleosomeAdjustedCoverage.wig nucleosomeAdjustedCoverage.wib # real 1m25.770s # Converted ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz # upper limit 8.76, lower limit -11.93 ln -s `pwd`/nucleosomeAdjustedCoverage.wib /gbdb/ce6/wib hgLoadWiggle ce6 nucleosomeAdjustedCoverage nucleosomeAdjustedCoverage.wig # Adj NSome Covrg # Adjusted nucleosome coverage (25bp) # MNase Coverage # Micrococcal nuclease control coverage # Cntl MNase +Covg # Micrococcal nuclease control coverage, sense strand reads # Cntl MNase -Covg # Micrococcal nuclease control coverage, antisense strand reads zcat ce6.nucleosomeControl.fwd.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce6 stdin \ | wigEncode stdin nucleosomeControlSenseCoverage.wig \ nucleosomeControlSenseCoverage.wib # Converted stdin, upper limit 142.00, lower limit 1.00 ln -s `pwd`/nucleosomeControlSenseCoverage.wib /gbdb/ce6/wib zcat ce6.nucleosomeControl.rev.bed.gz | sort -k1,1 -k2,2n \ | bedItemOverlapCount ce6 stdin \ | wigEncode stdin nucleosomeControlAntisenseCoverage.wig \ nucleosomeControlAntisenseCoverage.wib # Converted stdin, upper limit 145.00, lower limit 1.00 ln -s `pwd`/nucleosomeControlAntisenseCoverage.wib /gbdb/ce6/wib hgLoadWiggle ce6 nucleosomeControlSenseCoverage \ nucleosomeControlSenseCoverage.wig hgLoadWiggle ce6 nucleosomeControlAntisenseCoverage \ nucleosomeControlAntisenseCoverage.wig mkdir ce4.monoNucleosomes cd ce4.monoNucleosomes ln -s ../../../../ce4/bed/monoNucleosomes/stanford/clean/* . 
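# Optional check before lifting (a sketch): confirm the expected forward
# and reverse per-chromosome bed files were linked from the ce4 build.
ls chr*.forward.bed.gz chr*.reverse.bed.gz | wc -l
#	expect a forward and a reverse file for each chromosome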
cd .. # back to the monoNucleosomes build directory for the liftOver below zcat ce4.monoNucleosomes/chr*.forward.bed.gz \ | liftOver stdin ce4ToCe6.over.chain.gz \ stdout ce6.monoNucleosomesSense.unMapped \ | gzip > ce6.monoNucleosomesSense.bed.gz zcat ce4.monoNucleosomes/chr*.reverse.bed.gz \ | liftOver stdin ce4ToCe6.over.chain.gz \ stdout ce6.monoNucleosomesAntiSense.unMapped \ | gzip > ce6.monoNucleosomesAntiSense.bed.gz bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \ | wigEncode stdin monoNucleosomesSense.wig monoNucleosomesSense.wib # Converted stdin, upper limit 16195.00, lower limit 1.00 bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \ | wigEncode stdin monoNucleosomesAntiSense.wig \ monoNucleosomesAntiSense.wib # Converted stdin, upper limit 19137.00, lower limit 1.00 ln -s `pwd`/monoNucleosomesSense.wib /gbdb/ce6/wib ln -s `pwd`/monoNucleosomesAntiSense.wib /gbdb/ce6/wib # for downloads: bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \ | gzip --rsyncable > monoNucleosomesSense.wigVariable.gz bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \ | gzip --rsyncable > monoNucleosomesAntiSense.wigVariable.gz # NSome Coverage # Coverage of nucleosome predictions from SOLiD core alignments # Control # mononucleosome control hgLoadWiggle ce6 monoNucleosomesSense monoNucleosomesSense.wig # NSome Core +Covg # Coverage of mononucleosomal fragments, sense strand reads hgLoadWiggle ce6 monoNucleosomesAntiSense monoNucleosomesAntiSense.wig # NSome Core -Covg # Coverage of mononucleosomal fragments, antisense strand reads ######################################################################### # lift 25mers repeat track from ce4 (DONE - 2008-09-30 - Hiram) mkdir /cluster/data/ce6/bed/25mersRepeats cd /cluster/data/ce6/bed/25mersRepeats ln -s /cluster/data/ce4/bed/25mersRepeats/withScore.bed \ ./ce4.25mersRepeats.bed ln -s \ /usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \ ce4ToCe6.over.chain.gz liftOver ce4.25mersRepeats.bed ce4ToCe6.over.chain.gz \ ce6.25mersRepeats.bed ce6.unMapped # no elements failed the liftOver hgLoadBed ce6 25mersRepeats ce6.25mersRepeats.bed # Loaded 757742 elements of size 9 # clean up rm bed.tab gzip ce6.25mersRepeats.bed ######################################################################### # hgPal downloads (DONE braney 2008-09-30) ssh hgwdev screen bash rm -rf /cluster/data/ce6/bed/multiz6way/pal mkdir /cluster/data/ce6/bed/multiz6way/pal cd /cluster/data/ce6/bed/multiz6way/pal mz=multiz6way gp=refGene db=ce6 echo $db > order.lst echo "select settings from trackDb where tableName='$mz'" | \ hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \ tail -n +2 >> order.lst mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > refGene.jobs time sh -x refGene.jobs > refGene.job.log 2>&1 & sleep 1 tail -f refGene.job.log # real 14m44.986s # user 2m26.113s # sys 1m41.837s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA
exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/usr/local/apache/htdocs/goldenPath/$db/$mz rm -rf $pd/$gp.exonAA.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz6way gp=sangerGene db=ce6 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 16m2.266s # user 2m43.764s # sys 2m7.173s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/usr/local/apache/htdocs/goldenPath/ce6/multiz6way ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ######################################################################### ################################################ # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd) update genbank.conf: ce6.upstreamGeneTbl = refGene ce6.upstreamMaf = multiz6way /hive/data/genomes/ce6/bed/multiz6way/species.lst ############################################################################ # CE6->CE7 LIFTOVER (DONE - 2008-06-24 - Hiram) cd /hive/data/genomes/ce6 # test procedure with -debug first doSameSpeciesLiftOver.pl -bigClusterHub=swarm -workhorse=hgwdev \ -ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 -debug cd bed/blat.ce7.2009-07-28 time nice -n +19 doSameSpeciesLiftOver.pl -bigClusterHub=swarm \ -workhorse=hgwdev \ -ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 > do.log 2>&1 & # real 8m7.833s ######################################################################### # ALG Binding sites (DONE - 2010-01-14 - Hiram) # From the Gene Yeo lab: # fetch files T="http://www.snl.salk.edu/~lovci/Amy/ce6/alltheexperiments/for_ucsc" one() { wget --timestamping \ "${T}/$1" -O $1 } one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz one MT1.neg.off.WIG.gz one MT1.pos.off.WIG.gz one MT2.neg.off.WIG.gz one MT2.pos.off.WIG.gz one MT3.neg.off.WIG.gz one MT3.pos.off.WIG.gz one WT1.neg.off.WIG.gz one WT1.pos.off.WIG.gz one WT2.neg.off.WIG.gz one WT2.pos.off.WIG.gz one WT3.neg.off.WIG.gz one WT3.pos.off.WIG.gz one WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz # encode and load wiggle tracks one() { ID=$1 NP=$2 RF=$3 UC=$4 LC=`echo $ID | tr '[A-Z]' '[a-z]'` zcat $ID.$NP.off.WIG.gz \ | wigEncode stdin ${LC}${UC}.wig ${LC}${UC}.wib > ${LC}$UC.log 2>&1 ln -s `pwd`/${LC}${UC}.wib /gbdb/ce6/wib hgLoadWiggle ce6 ${LC}${UC} ${LC}${UC}.wig } one MT1 neg reverse Reverse one MT1 pos forward Forward one MT2 neg reverse Reverse one MT2 pos forward Forward one MT3 neg reverse Reverse one MT3 pos forward Forward one WT1 neg reverse Reverse one WT1 pos forward Forward one WT2 neg reverse Reverse 
one WT2 pos forward Forward one WT3 neg reverse Reverse one WT3 pos forward Forward # load BED tables: zcat WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz \ | grep -v track \ | awk -F'\t' '{printf "%s %d %d %s 1 %s\n", $1, $2, $3, $4, $6}' \ | hgLoadBed -noNameIx ce6 algBindSites stdin zcat all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz \ | grep -v track | awk -F'\t' '{printf "%s %d %d %s 0 %s\n", $1, $2, $3, $4, $6}' | hgLoadBed -noNameIx ce6 algBindGenic stdin ############################################################################ # CE6->CE4 LIFTOVER (working - 2010-04-15 - chin) mkdir /hive/data/genomes/ce6/bed/blat.ce4.2010-04-15 cd /hive/data/genomes/ce6/bed/blat.ce4.2010-04-15 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/ce6/11.ooc \ ce6 ce4 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -ooc /scratch/data/ce6/11.ooc \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ ce6 ce4 > do.log 2>&1 & # real 6m22.653s ######################################################################### # RE-BUILD HGNEAR PROTEIN BLAST TABLES -- TARGET (DONE 8/6/10, Fan) ssh hgwdev mkdir -p /hive/data/genomes/ce6/bed/hgNearBlastp/100806 cd /hive/data/genomes/ce6/bed/hgNearBlastp/100806 cat << _EOF_ > config.ra # Latest worm vs. other Gene Sorter orgs: # human, mouse, rat, zebrafish, fly, yeast targetGenesetPrefix sanger targetDb ce6 queryDbs hg19 mm9 rn4 danRer6 dm3 sacCer2 recipBest hg19 mm9 rn4 danRer6 dm3 sacCer2 ce6Fa /hive/data/genomes/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa rn4Fa /hive/data/genomes/rn4/bed/blastp/known.faa hg19Fa /hive/data/genomes/hg19/bed/ucsc.12/ucscGenes.faa mm9Fa /hive/data/genomes/mm9/bed/ucsc.12/ucscGenes.faa danRer6Fa /hive/data/genomes/danRer6/bed/blastp/danRer6.ensPep.faa dm3Fa /hive/data/genomes/dm3/bed/flybase5.3/flyBasePep.fa sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa buildDir /cluster/data/ce6/bed/hgNearBlastp/100806 scratchDir /hive/data/genomes/ce6/bed/hgNearBlastp/100806/tmp _EOF_ doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log # *** All done! # *** Check these tables in ce6: # *** sangerBlastTab hgBlastTab mmBlastTab rnBlastTab drBlastTab dmBlastTab scBlastTab ############################################################################ # LIFTOVER TO ce9 (DONE - 2010-10-18 - Hiram ) mkdir /hive/data/genomes/ce6/bed/blat.ce9.2010-10-18 cd /hive/data/genomes/ce6/bed/blat.ce9.2010-10-18 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce6/jkStuff/11.ooc -debug ce6 ce9 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce9 > do.log 2>&1 # real 4m47.692s ############################################################################# # LIFTOVER TO ce10 (DONE - 2011-05-24 - Hiram ) mkdir /hive/data/genomes/ce6/bed/blat.ce10.2011-05-24 cd /hive/data/genomes/ce6/bed/blat.ce10.2011-05-24 # -debug run to create run dir, preview scripts... 
doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce6/jkStuff/11.ooc -debug ce6 ce10 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce10 > do.log 2>&1 # about 4 minutes # verify it works on genome-test ############################################################################# ##########################################################################pubStart # Publications track (DONE - 04-27-12 - Max) # article download and conversion is run every night on hgwdev: # 22 22 * * * /hive/data/inside/literature/pubtools/pubCronDailyUpdate.sh # the script downloads files into /hive/data/outside/literature/{PubMedCentral,ElsevierConsyn}/ # then converts them to text into /hive/data/outside/literature/{pmc,elsevier} # all configuration of the pipeline is in /hive/data/inside/literature/pubtools/lib/pubConf.py # data processing was run manually like this export PATH=/cluster/home/max/bin/x86_64:/cluster/bin/x86_64:/cluster/home/max/software/bin/:/cluster/software/bin:/cluster/home/max/projects/pubtools:/cluster/home/max/bin/x86_64:/hive/groups/recon/local/bin:/usr/local/bin:/usr/bin:/bin:/usr/bin/X11:/cluster/home/max/usr/src/scripts:/cluster/home/max/usr/src/oneshot:/cluster/home/max/bin:/cluster/bin/scripts:.:/cluster/home/max/usr/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/lpp/mmfs/bin/:/opt/dell/srvadmin/bin:/cluster/bin/scripts:/hive/users/hiram/cloud/ec2-api-tools-1.3-51254/bin:/cluster/home/max/bin:/usr/bin/X11:/usr/java/jdk1.6.0_20/bin:/cluster/home/max/bin:/hive/data/inside/literature/pubtools/ # pmc cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat init /hive/data/inside/literature/blat/pmc/ /hive/data/inside/literature/text/pmc ssh swarm cd /hive/data/inside/literature/pubtools/runs/pmcBlat/ pubBlat steps:annot-tables exit pubBlat load # elsevier cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat init /hive/data/inside/literature/blat/elsevier/ /hive/data/inside/literature/text/elsevier ssh swarm cd /hive/data/inside/literature/pubtools/runs/elsBlat/ pubBlat steps:annot-tables exit pubBlat load #--pubEnd