# for emacs: -*- mode: sh; -*- # Caenorhabditis elegans # Washington University School of Medicine GSC and Sanger Institute WS210 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2010-07-08 - Hiram) mkdir /hive/data/genomes/ce9 cd /hive/data/genomes/ce9 mkdir ws210 cd ws210 wget --no-parent --timestamping -m -nH --cut-dirs=5 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/c_elegans/ # about 11 minutes mkdir acedb cd acedb wget --no-parent --timestamping -m -nH --cut-dirs=4 \ ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/acedb/ ######################################################################### # NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2010-07-08 - Hiram) mkdir /hive/data/genomes/ce9/sanger cd /hive/data/genomes/ce9/sanger # Fix fasta names: zcat ../ws210/sequences/dna/CHR*[XVAI].dna.gz \ | sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \ | gzip -c > UCSC.fa.gz faCount UCSC.fa.gz #seq len A C G T N cpg chrI 15072421 4835939 2695879 2692150 4848453 0 503521 chrII 15279323 4878195 2769216 2762198 4869714 0 492149 chrIII 13783685 4444652 2449141 2466322 4423570 0 459669 chrIV 17493784 5711040 3034767 3017008 5730969 0 522372 chrM 13794 4335 1225 2055 6179 0 110 chrV 20924143 6750393 3712058 3701397 6760295 0 638983 chrX 17718854 5747199 3119702 3117868 5734085 0 514715 total 100286004 32371753 17781988 17758998 323732650 3131519 # Make sure we get the same sizes from this command: zcat ../ws210/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \ | faCount stdin #seq len A C G T N cpg CHROMOSOME_I 15072421 4835939 2695879 2692150 4848453 0 503521 CHROMOSOME_II 15279323 4878195 2769216 2762198 4869714 0 492149 CHROMOSOME_III 13783685 4444652 2449141 2466322 4423570 0 459669 CHROMOSOME_IV 17493784 5711040 3034767 3017008 5730969 0 522372 CHROMOSOME_MtDNA 13794 4335 1225 2055 6179 0 110 CHROMOSOME_V 20924143 6750393 3712058 3701397 6760295 0 638983 CHROMOSOME_X 17718854 5747199 3119702 3117868 5734085 0 514715 total 100286004 32371753 17781988 17758998 323732650 3131519 # Compare to WS200: +1 in chrII, -3 in chrIII faCount ../../ce7/sanger/UCSC.fa.gz #seq len A C G T N cpg chrI 15072421 4835939 2695879 2692150 4848453 0 503521 chrII 15279324 4878196 2769216 2762198 4869714 0 492149 chrIII 13783682 4444652 2449139 2466321 4423570 0 459669 chrIV 17493784 5711040 3034767 3017008 5730969 0 522372 chrM 13794 4335 1225 2055 6179 0 110 chrV 20924143 6750393 3712058 3701397 6760295 0 638983 chrX 17718854 5747199 3119702 3117868 5734085 0 514715 total 100286002 32371754 17781986 17758997 323732650 3131519 # Fix AGP names: sed -e 's/^/chr/' ../ws210/sequences/dna/CHR*.agp > UCSC.agp # And add a fake mitochondrial AGP entry for the sake of downstream # tools (make sure the GenBank sequence is identical to given): echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp ######################################################################### # run the makeGenomeDb procedure to create the db and unmasked sequence # (DONE - 2010-07-08 - Hiram) cd /hive/data/genomes/ce9 cat << '_EOF_' > ce9.config.ra # Config parameters for makeGenomeDb.pl: db ce9 clade worm genomeCladePriority 10 scientificName Caenorhabditis elegans commonName C. elegans assemblyDate Jan 2010 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS210 assemblyShortLabel WS210 orderKey 823 # mito sequence included in fasta and AGP file mitoAcc none fastaFiles /hive/data/genomes/ce9/sanger/UCSC.fa.gz agpFiles /hive/data/genomes/ce9/sanger/UCSC.agp # qualFiles /dev/null dbDbSpeciesDir worm taxId 6239 '_EOF_' # << emacs mkdir jkStuff # run just to AGP to make sure things are sane first nice -n +19 makeGenomeDb.pl -noGoldGapSplit ce9.config.ra -stop=agp \ > jkStuff/makeGenomeDb.agp.log 2>&1 # now, continuing to make the Db and all time nice -n +19 makeGenomeDb.pl -noGoldGapSplit ce9.config.ra -continue=db \ > jkStuff/makeGenomeDb.db.log 2>&1 # real 1m37.013s # take the trackDb business there and check it into the source tree # fixup the description, gap and gold html page descriptions ######################################################################### # REPEATMASKER (DONE - 2010-07-08 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/repeatMasker cd /hive/data/genomes/ce9/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \ -buildDir=/hive/data/genomes/ce9/bed/repeatMasker ce9 > do.log 2>&1 & # about 103 minutes # from the do.log: # RepeatMasker version development-$Id: # RepeatMasker,v 1.24 2009/06/08 18:07:05 angie Exp $ # CC RELEASE 20090604; * ######################################################################### # SIMPLE REPEATS (DONE - 2010-07-09 - Hiram) ssh kkstore06 screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/simpleRepeat cd /hive/data/genomes/ce9/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -smallClusterHub=memk \ -buildDir=/hive/data/genomes/ce9/bed/simpleRepeat ce9 > do.log 2>&1 & # about 30 minutes ######################################################################### # MASK SEQUENCE WITH RM+TRF (DONE - 2010-07-09 - Hiram) # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed, # now it's time to combine the masking into the final ce9.2bit, # following the instructions at the end of doSimpleRepeat's output. ssh kolossus cd /hive/data/genomes/ce9 twoBitMask ce9.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce9.2bit # You can safely ignore the warning about extra BED columns twoBitToFa ce9.2bit stdout | faSize stdin # 100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower) # in 7 sequences in 1 files # %13.38 masked total, %13.38 masked real # set the symlink on hgwdev to /gbdb/ce9 cd /gbdb/ce9 ln -s /hive/data/genomes/ce9/ce9.2bit . ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE - 2010-07-09 - Hiram) # Use -repMatch=100 (based on size -- for human we use 1024, and # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome # size from featureBits. So we would use 34, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. cd /hive/data/genomes/ce9 blat ce9.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/ce9.11.ooc -repMatch=100 # Wrote 8514 overused 11-mers to jkStuff/11.ooc mkdir /hive/data/staging/data/ce9 cd /hive/data/genomes/ce9 cp -p ce9.2bit jkStuff/ce9.11.ooc chrom.sizes /hive/data/staging/data/ce9 # request rsync to cluster nodes ######################################################################### # BLATSERVERS ENTRY (DONE - 2010-07-09 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce9", "blat9", "17778", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce9", "blat9", "17779", "0", "1");' \ hgcentraltest # test it with some sequence hgsql -e 'DELETE FROM blatServers where db="ce9";' hgcentraltest # new blat server 19 October hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce9", "blatx", "17818", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ce9", "blatx", "17819", "0", "1");' \ hgcentraltest ############################################################################ # reset default position, same as ce4 on the ZC101 / unc-52 locus ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746" where name="ce9";' hgcentraltest ############################################################################ # GENBANK AUTO UPDATE (DONE - 2010-09-09 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add ce9 just before ce6 # ce9 (C. elegans) ce9.serverGenome = /hive/data/genomes/ce9/ce9.2bit ce9.clusterGenome = /scratch/data/ce9/ce9.2bit ce9.ooc = /scratch/data/ce9/ce9.11.ooc ce9.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} ce9.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} ce9.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} ce9.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} ce9.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} ce9.maxIntron = 100000 ce9.lift = no ce9.genbank.mrna.xeno.load = yes ce9.refseq.mrna.xeno.load = yes ce9.downloadDir = ce9 # ce9.upstreamGeneTbl = refGene # ce9.upstreamMaf = multiz6way /hive/data/genomes/ce9/bed/multiz6way/species.lst git commit -m "Added ce9 C. elegans WS120" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial ce9 & # logFile: var/build/logs/2010.09.07-11:34:55.ce9.initalign.log # real 118m26.190s # this had a temporary failure: # Cannot allocate memory # can't fork # command failed: gbAlignInstall -noMigrate -workdir=work/initial.ce9/align -orgCats=native genbank.179.0 daily.0826 mrna ce9 at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. # command failed: gbAlignFinish -workdir=work/initial.ce9/align -noMigrate ce9 at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. at /cluster/genbank/genbank/bin/../lib/gbCommon.pm line 272. # continuing time ./etc/align-genbank -finish -workdir work/initial.ce9/align # var/build/logs/2010.09.09-15:09:26.align.log # real 0m47.281s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce9 # logFile: var/dbload/hgwdev/logs/2010.09.09-15:11:20.dbload.log # real 24m52.293s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add ce9 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added ce9 - C. elegans WS210" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################### ## RNA sangerGenes (DONE - 2010-09-14 - Hiram) mkdir /hive/data/genomes/ce9/bed/sangerNonCoding cd /hive/data/genomes/ce9/bed/sangerNonCoding # scan out lines matching these column 2 words, then see what column 3 # has to say for the next filter. for C in I II III IV V X M do echo "chr${C}.gff -> test.chr${C}.gff" grep -v "^#" ../sangerGene/chr${C}.gff | awk -F'\t' ' { if (match($2,"^Non_coding_transcript|^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) { printf "%s\n", $0 } } ' | sort -u | sort -k4n > test.chr${C}.gff done for C in I II III IV V X M do echo "chr${C}.gff -> rna.chr${C}.gff" grep -v "^#" ../sangerGene/chr${C}.gff | sed -e 's/"//g' | awk -F'\t' ' { if (match($2,"^Non_coding_transcript|^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) { if (match($3,"exon|intron")) { gsub("Transcript ","",$9) gsub(" .*","",$9) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > rna.chr${C}.gff done # scan the column 2 pseudoGene definitions to see what column 3 says for C in I II III IV V X M do echo "chr${C}.gff -> test.chr${C}.gff" grep -v "^#" ../sangerGene/chr${C}.gff | awk -F'\t' ' { if (match($2,"^Pseudogene")) { printf "%s\n", $0 } } ' | sort -u | sort -k4n > test.chr${C}.gff done for C in I II III IV V X M do echo "chr${C}.gff -> pseudoGene.chr${C}.gff" grep -v "^#" ../sangerGene/chr${C}.gff | sed -e 's/"//g' | awk -F'\t' ' { if (match($2,"^Pseudogene")) { if (match($3,"exon|intron")) { gsub("Pseudogene ","",$9) gsub(" .*","",$9) gsub("\".*","",$9) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > pseudoGene.chr${C}.gff done cd /hive/data/genomes/ce9/bed/sangerNonCoding ldHgGene -out=sangerRnaGene.gp ce9 sangerRnaGene rna.*.gff # Read 16663 transcripts in 18079 lines in 7 files # 16663 groups 7 seqs 10 sources 2 feature types # 16663 gene predictions ldHgGene ce9 sangerRnaGene rna.*.gff ldHgGene -out=sangerPseudoGene.gp ce9 sangerPseudoGene pseudoGene.*.gff # Read 1565 transcripts in 7919 lines in 7 files # 1565 groups 6 seqs 1 sources 2 feature types # 1565 gene predictions ldHgGene ce9 sangerPseudoGene pseudoGene.*.gff ######################################################################### ## SANGER GENE TRACK (DONE - 2010-09-16 - Hiram) ## There is a tremendous amount of extraneous notations in the gff ## files. Filter them down to a manageable set, change the chrom ## names, eliminate duplicates, select only what will work in ## ldHgGene. Plus there is a lot of confusion in the names, ever since ## the ce4 build. These notes from Angie are still accurate: ## # ADJUST KIMEXPDISTANCE NAMES TO MATCH SANGERCANONICAL (DONE 7/12/07 angie) # kimExpDistance has all [A-Z]+[0-9]+\.[0-9]+ names, but now sangerCanonical # has some of the new [A-Z]+[0-9]+\.[0-9]+\.[0-9]+ names, so when hgNear # looks for a canonical name in kimExpDistance, it often can't find it. # Canonical also has [A-Z]+[0-9]+\.[0-9]+[a-z] # and [A-Z]+[0-9]+\.[0-9]+[a-z]\.[0-9]+ # kimExpDistance has 4 [A-Z]+[0-9]+\.[0-9]+[A-Z] # Update the names in kimExpDistance to match sangerCanonical. mkdir /hive/data/genomes/ce9/bed/sangerGene cd /hive/data/genomes/ce9/bed/sangerGene /hive/data/genomes/ce9/ws210] less genome_feature_tables/GFF2/CHROMOSOME_I.gff for C in I II III IV V X do echo -n "${C} " cat ../../ws210/genome_feature_tables/GFF2/CHROMOSOME_${C}.gff | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff done C=M echo -n "${C} " cat ../../ws210/genome_feature_tables/GFF2/CHROMOSOME_MtDNA.gff | \ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \ s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \ -e 's/CDS "//' -e 's/"//' \ > chr${C}.gff # now filter them to gene definitions # scan column 2 of these guys to decide on the filter here. # After applying this column 2 filter, scan column 3 to see what is # appropriate to filter for column 3. Use that column 3 in the next # scan. for C in I II III IV V X M do echo "chr${C}.gff -> test.chr${C}.gff" grep -v "^#" chr${C}.gff | awk -F'\t' ' { if (match($2,"curated|Coding_transcript")) { printf "%s\n", $0 } } ' | sort -u | sort -k4n > test.chr${C}.gff done if (match($2,"curated|Coding_transcript")) { # now filter them to gene definitions for C in I II III IV V X M do echo "chr${C}.gff -> filtered.chr${C}.gff" grep -v "^#" chr${C}.gff | sed -e 's/"//g' | awk -F'\t' ' { if (match($2,"curated|Coding_transcript")) { if (match($3,"intron|coding_exon|exon|CDS|three_prime_UTR|five_prime_UTR")) { gsub("coding_exon","CDS",$3) gsub("Transcript ","",$9) gsub(" .*","",$9) gsub("three_prime_UTR","3utr",$3) gsub("five_prime_UTR","5utr",$3) for (i = 1; i < 9; ++i) { printf "%s\t", $i } printf "%s\n", $9 } } } ' | sort -u | sort -k4n > filtered.chr${C}.gff done # turn them into genePred files nice -n +19 ldHgGene -nobin ce9 sangerGene filtered.*.gff # Read 31178 transcripts in 1036352 lines in 7 files # 31178 groups 7 seqs 2 sources 5 feature types # 31178 gene predictions nice -n +19 ldHgGene -nobin ce9 sangerGene filtered.*.gff mv genePred.tab filteredGenePred.tab # there are a set of names that are merely CDS definitions # of the alleles, for example: AC3.5 is CDS for AC3.5.1 and AC3.5.2 # we do not need the AC3.5 cat << '_EOF_' > removeNoDotGenes.pl #!/usr/bin/env perl use strict; use warnings; my %fullNames; my %minusDotName; my $done = 0; while (my $line = <>) { chomp $line; die "duplicate name $line" if (exists($fullNames{$line})); my @a = split('\.', $line); die "more than three dots ? $line" if (scalar(@a) > 3); my $shortName = $line; if (3 == scalar(@a)) { $shortName = "$a[0].$a[1]"; if (exists($minusDotName{$shortName})) { $minusDotName{$shortName} .= ",$line"; } else { $minusDotName{$shortName} = $line; } } elsif (2 == scalar(@a)) { $shortName = "$a[0]"; if (exists($minusDotName{$shortName})) { $minusDotName{$shortName} .= ",$line"; } else { $minusDotName{$shortName} = $line; } } $fullNames{$line} = $shortName; # printf "$line -> $shortName\n" if ($done < 10); ++$done; } foreach my $key (keys %fullNames) { if (exists($minusDotName{$key})) { printf "$key\t$minusDotName{$key}\n"; } } '_EOF_' # << happy emacs ./removeNoDotGenes.pl < ucsc.allGenes.list | cut -f 1 \ | sort -u > namesToRemove.list for N in `cat namesToRemove.list` do hgsql -N -e "delete from sangerGene where name=\"${N}\";" ce9 done cat << '_EOF_' > mkSangerLinks.pl #!/usr/bin/env perl use strict; use warnings; my %ucscGeneNames; my %strippedNames; open (FH, ") { chomp $line; $ucscGeneNames{$line} = 1; my @a = split('\.', $line); die "more than three dots ? $line" if (scalar(@a) > 3); my $shortName = $line; if (3 == scalar(@a)) { $shortName = "$a[0].$a[1]"; } elsif (2 == scalar(@a)) { $shortName = "$a[0]"; } $strippedNames{$shortName} = $line; } close (FH); my %geneToProtein; open (FH, ") { chomp $line; my ($gene, $protein) = split('\t',$line); if (exists($ucscGeneNames{$gene})) { if (exists($geneToProtein{$gene})) { printf STDERR "duplicate: $gene $protein $geneToProtein{$gene}\n"; } else { $geneToProtein{$gene} = $protein; } } elsif (exists($strippedNames{$gene})) { # my $shortName = $strippedNames{$gene}; my $shortName = $gene; if (exists($geneToProtein{$shortName})) { printf STDERR "duplicate: $shortName $protein $geneToProtein{$shortName}\n"; } else { $geneToProtein{$shortName} = $protein; # printf "shortName: $line $shortName $protein\n"; } } } close (FH); my $missing = 0; foreach my $key (sort keys %ucscGeneNames) { my @a = split('\.', $key); my $shortName = $key; if (3 == scalar(@a)) { $shortName = "$a[0].$a[1]"; } elsif (2 == scalar(@a)) { $shortName = "$a[0]"; } if (!exists($geneToProtein{$key})) { if (!exists($geneToProtein{$shortName})) { ++$missing; printf STDERR "missing: $key $shortName\n"; $geneToProtein{$key} = $key; } else { $geneToProtein{$key} = $geneToProtein{$shortName}; } } } printf STDERR "missing proteins: $missing\n"; my %genesDone; open (SL,">sangerLinks.tab") or die "can not write to sangerLinks.tab"; open (FH, ") { chomp $line; my ($name, $descr) = split('\t', $line); if (exists($geneToProtein{$name})) { printf SL "%s\t%s\t%s\n", $name, $geneToProtein{$name}, $descr; $genesDone{$name} = 1; } } close(FH); my $noCount = 0; foreach my $key (sort keys %ucscGeneNames) { if (!exists($genesDone{$key})) { printf SL "%s\t%s\t(no description)\n", $key, $key; ++$noCount; } } close(SL); printf STDERR "no description for $noCount genes\n"; '_EOF_' # << happy emacs chmod +x mkSangerLinks.pl ./mkSangerLinks.pl # no description for 150 genes hgLoadSqlTab ce9 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \ sangerLinks.tab hgsql -N -e "select name from sangerGene;" ce9 | sort > name.sangerGene.txt sort /hive/data/genomes/ce9/ws210/tarFile/gene_name.txt > orfToGene.txt cut -f1 orfToGene.txt | sort > orf.name.list comm -23 name.sangerGene.txt orf.name.list > orf.no.match.list grep "^>" /hive/data/genomes/ce9/ws210/sequences/protein/wormpep.table210 \ | sed -e "s/^>//" | sort > wormpep.table210 join orf.no.match.list wormpep.table210 > orf.missing.txt join name.sangerGene.txt orfToGene.txt > orf.matching.txt cat << '_EOF_' > missingOrf.pl #!/usr/bin/env perl use strict; use warnings; open (PT, ">protein.tab") or die "can not write to protein.tab"; open (FH, ") { chomp $line; my ($geneName, $ceName, $wbGene, $others) = split('\s+',$line, 4); my @other = split('\s+', $others); my $uniProt = ""; my $proteinId = ""; my $locus = ""; my $paragraph = ""; for (my $i = 0; $i < scalar(@other); ++$i) { if ($other[$i] =~ m/^locus:/) { $locus = $other[$i]; $locus =~ s/^locus://; } elsif ($other[$i] =~ m/^UniProt:/) { $uniProt = $other[$i]; $uniProt =~ s/^UniProt://; } elsif ($other[$i] =~ m/^protein_id:/) { $proteinId = $other[$i]; $proteinId =~ s/^protein_id://; } elsif ($other[$i] =~ m/status:/) { next; } else { if (length($paragraph) > 0) { $paragraph .= " $other[$i]"; } else { $paragraph = $other[$i]; } } } if (length($locus) > 0) { printf "%s\t%s\n", $geneName, $locus; } else { printf "%s\t%s\n", $geneName, $geneName; } if (length($proteinId) > 1) { printf PT "%s\t%s\n", $geneName, $proteinId; } if (length($proteinId) > 1) { printf PT "%s\t%s\n", $geneName, $proteinId; } } close (FH); close (PT); '_EOF_' # << happy emacs chmod +x missingOrf.pl ./missingOrf.pl > addTo.orfToGene.txt cat addTo.orfToGene.txt orf.matching.txt | tr '[ ]' '[\t]' \ > fixedOrfToGene.tab hgLoadSqlTab ce9 orfToGene ~/kent/src/hg/lib/orfToGene.sql \ fixedOrfToGene.tab cat << '_EOF_' > geneToProtein.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "sed -e 's/^>//' ../../ws210/sequences/protein/wormpep.table210|") or die "can not read wormpep.table210"; while (my $line = ) { chomp $line; my ($geneName, $ceName, $wbGene, $others) = split('\s+',$line, 4); my @other = split('\s+', $others); my $uniProt = ""; my $proteinId = ""; my $locus = ""; my $paragraph = ""; for (my $i = 0; $i < scalar(@other); ++$i) { if ($other[$i] =~ m/^locus:/) { $locus = $other[$i]; $locus =~ s/^locus://; } elsif ($other[$i] =~ m/^UniProt:/) { $uniProt = $other[$i]; $uniProt =~ s/^UniProt://; } elsif ($other[$i] =~ m/^protein_id:/) { $proteinId = $other[$i]; $proteinId =~ s/^protein_id://; } elsif ($other[$i] =~ m/status:/) { next; } else { if (length($paragraph) > 0) { $paragraph .= " $other[$i]"; } else { $paragraph = $other[$i]; } } } if (length($uniProt) > 0) { printf "%s\t%s\n", $geneName, $uniProt; } elsif (length($proteinId) > 0) { printf "%s\t%s\n", $geneName, $proteinId; } } close (FH); '_EOF_' # << happy emacs chmod +x geneToProtein.pl ./geneToProtein.pl | sort > protein.tab cat ../../ws210/tarFile/chr*.gff | awk '{print $NF}' \ | sort -u > tar.gene.list cat filtered.chr*.gff | awk '{print $NF}' \ | sort -u > ucsc.filtered.gene.list # obtained tar file with lists from Paul Davis # our genes are a subset since we separate non-coding # using what we already have: cut -f1 paragraph.tab | sort > local.paragraph.nameList ln -s /hive/data/genomes/ce9/ws210/tarFile/paragraph.txt ./paragraph.txt sort paragraph.txt > sort.paragraph.txt # selecting out what they have for our business join sort.paragraph.txt local.paragraph.nameList \ | sed -e "s/ /\t/" > join.local.paragraph.txt # Add proteinID field to sangerGene table, used by Gene Sorter hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce9 # To add index on this column hgsql -e 'alter table sangerGene add index(proteinID);' ce9 # Use SwissProt IDs in sangerLinks table to fill proteinID # column in the sangerGene table hgsql -e 'update sangerGene as g, sangerLinks as l \ set g.proteinID = l.protName where g.name = l.orfName;' ce9 # check there are rows with the protName field filled hgsql -N -e "select proteinId from sangerGene" ce9 | sort | uniq -c | wc -l # 23411 hgPepPred ce9 generic sangerPep ../../ws210/sequences/protein/wormpep210 ############################################################################### ######## MAKING GENE SORTER TABLES #### (DONE - 2010-09-09 - Hiram) ## Before starting this, the sangerPep and names need to be fixed # These are instructions for building the # Gene Sorter browser. Don't start these until # there is a sangerGene table with a proteinID column containing Swiss-Prot # protein IDs, and also Kim lab expression data is required (in hgFixed). mkdir /hive/data/genomes/ce9/bed/geneSorter cd /hive/data/genomes/ce9/bed/geneSorter # Cluster together various alt-splicing isoforms. # Creates the sangerIsoforms and sangerCanonical tables # this creates tables: sangerCanonical sangerIsoforms hgClusterGenes ce9 sangerGene sangerIsoforms sangerCanonical # Got 19952 clusters, from 31290 genes in 7 chromosomes featureBits ce9 sangerCanonical # 58389702 bases of 100286004 (58.223%) in intersection featureBits ce6 sangerCanonical # 57412493 bases of 100281426 (57.251%) in intersection # Create Self blastp table - sangerBlastTab mkdir -p /hive/data/genomes/ce9/bed/blastTab cd /hive/data/genomes/ce9/bed/blastTab pepPredToFa hg19 knownGenePep hg19.known.faa pepPredToFa ce9 sangerPep ce9.sanger.faa cat << '_EOF_' > ce9.config.ra targetGenesetPrefix sanger targetDb ce9 queryDbs hg19 hg19Fa /hive/data/genomes/ce9/bed/blastTab/hg19.known.faa ce9Fa /hive/data/genomes/ce9/bed/blastTab/ce9.sanger.faa buildDir /hive/data/genomes/ce9/bed/blastTab/ce9 scratchDir /hive/data/genomes/ce9/bed/blastTab/ce9/tmp '_EOF_' # << happy emacs mkdir ce9 time nice -n +19 doHgNearBlastp.pl -targetOnly -noLoad -workhorse=hgwdev \ -clusterHub=pk ce9.config.ra > do.log 2>&1 & # real 5m56.823s time ./ce9/run.ce9.ce9/loadPairwise.csh # Scanning through 968 files # Loading database with 1228800 rows # real 0m37.193s cd /hive/data/genomes/ce9/bed/geneSorter # CREATE EXPRESSION DISTANCE TABLES FOR # KIM LAB EXPRESSION DATA - this creates ce9.kimExpDistance time hgExpDistance ce9 hgFixed.kimWormLifeMedianRatio \ hgFixed.kimWormLifeMedianExps kimExpDistance time hgExpDistance -lookup=sangerToRefSeq ce9 \ hgFixed.kimWormLifeMedianRatio \ hgFixed.kimWormLifeMedianExps kimExpDistance # Have 19134 elements in hgFixed.kimWormLifeMedianRatio # Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio # real 1m50.164s # CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES # sangerToRefSeq hgMapToGene ce9 refGene sangerGene sangerToRefSeq # SET dbDb TABLE ENTRY FOR GENE SORTER # set hgNearOk to 1 on hgcentraltest to make Ce9 Gene Sorter viewable hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce9";' \ hgcentraltest # Running joinerCheck to see what is sane: cd ~/kent/src/hg/makeDb/schema joinerCheck -identifier=wormBaseId -database=ce9 -times -keys ./all.joiner # ce9.kimExpDistance.query - hits 11528085 of 11528085 ok # ce9.kimExpDistance.target - hits 11528085 of 11528085 ok # ce9.orfToGene.name - hits 28125 of 28125 ok # ce9.sangerBlastTab.query - hits 1228800 of 1228800 ok # ce9.sangerBlastTab.target - hits 1228800 of 1228800 ok # ce9.sangerCanonical.transcript - hits 19952 of 19952 ok # ce9.sangerIsoforms.transcript - hits 28152 of 28152 ok # ce9.sangerLinks.orfName - hits 28152 of 28152 ok # ce9.sangerPep.name - hits 28152 of 28152 ok # ce9.sangerToRefSeq.name - hits 27945 of 27945 ok ## create a gene name to WBGene ID reference table ## to be used to construct URL mkdir /hive/data/genomes/ce9/bed/WBGeneID cd /hive/data/genomes/ce9/bed/WBGeneID # There was one duplicate entry in the file, the sort -u eliminates it sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \ ../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab # add in the one-dot names for those names that are only two-dot names # in this file: cat << '_EOF_' > addDotName.pl #!/usr/bin/env perl use strict; use warnings; my %sangerGeneName; open (FH, "sangerGene.name") or die "can not read sangerGene.name"; while (my $line = ) { chomp $line; $sangerGeneName{$line} = 1; } close (FH); open (FH,") { chomp $line; my ($name, $wbId) = split('\s+',$line); die "duplicate gene name $name $wbId" if (exists($twoDotNames{$name})); if (exists($sangerGeneName{$name})) { $twoDotNames{$name} = $wbId; } } close (FH); my %neededNames; foreach my $key (keys %twoDotNames) { my ($a, $b, $c, $d) = split('\.',$key); my $wbId = $twoDotNames{$key}; die "three dot name found $key $wbId" if (defined($d)); if (defined($c)) { my $oneDot = sprintf "%s.%s", $a, $b; if (!exists($twoDotNames{$oneDot})) { if (!exists($neededNames{$oneDot})) { $neededNames{$oneDot} = $wbId; } } } } foreach my $key (keys %twoDotNames) { printf "%s\t%s\n", $key, $twoDotNames{$key}; } foreach my $key (keys %neededNames) { printf "%s\t%s\n", $key, $neededNames{$key}; } '_EOF_' # << happy emacs chmod +x addDotName.pl hgsql -N -e "select name from sangerGene;" ce9 | sort -u > sangerGene.name ./addDotName.pl > fixed.sangerGeneToWBGeneID.tab hgLoadSqlTab ce9 sangerGeneToWBGeneID \ ~/kent/src/hg/lib/sangerGeneToWBGeneID.sql \ fixed.sangerGeneToWBGeneID.tab cd ~/kent/src/hg/makeDb/schema joinerCheck -identifier=wormBaseId -database=ce9 -times -keys ./all.joiner # After all the table names were fixed: # ce9.kimExpDistance.query - hits 11528085 of 11528085 ok # ce9.kimExpDistance.target - hits 11528085 of 11528085 ok # ce9.orfToGene.name - hits 28125 of 28125 ok # ce9.sangerBlastTab.query - hits 1228800 of 1228800 ok # ce9.sangerBlastTab.target - hits 1228800 of 1228800 ok # ce9.sangerCanonical.transcript - hits 19952 of 19952 ok # ce9.sangerIsoforms.transcript - hits 28152 of 28152 ok # ce9.sangerLinks.orfName - hits 28152 of 28152 ok # ce9.sangerPep.name - hits 28152 of 28152 ok # ce9.sangerToRefSeq.name - hits 27945 of 27945 ok # fixup the Pep names cd /hive/data/genomes/ce9/bed/geneSorter hgsql -N -e "select name from sangerPep;" ce9 | sort -u \ > name.sangerPep.ce9 hgsql -N -e "select name from sangerGene;" ce9 | sort -u \ > name.sangerGene.ce9 comm -13 name.sangerGene.ce9 name.sangerPep.ce9 > toRemove.sangerPep.ce9 wc -l toRemove.sangerPep.ce9 # 3026 toRemove.sangerPep.ce9 # it appears each name with two extensions: name.1.2 # is the cause of this trouble awk -F'.' 'NF == 3' name.sangerGene.ce9 \ | awk -F'.' '{printf "%s.%s\n", $1, $2}' | sort -u | wc -l # 3026 # try duplicating those names in the sangerPep table awk -F'.' 'NF == 3' name.sangerGene.ce9 \ | awk -F'.' '{printf "%s.%s\t%s.%s.%s\n", $1, $2, $1, $2, $3}' \ > name.translation cat << '_EOF_' > replicate.pl #!/usr/bin/env perl use strict; use warnings; open (FH, ") { chomp $line; my ($smallName, $bigName) = split('\t', $line); printf "rm -f $bigName.faa $smallName.faa\n"; printf "hgsql -N -e 'select * from sangerPep where name=\"$smallName\";' ce9 > $smallName.faa\n"; printf "sed -e 's/^$smallName/$bigName/' $smallName.faa > $bigName.faa\n"; printf "hgsql -N -e 'load data local infile \"$bigName.faa\" into table sangerPep;' ce9\n"; printf "rm -f $bigName.faa $smallName.faa\n"; } close (FH); open (FH, ") { chomp $line; my ($smallName, $bigName) = split('\t', $line); printf "hgsql -N -e 'delete from sangerPep where name=\"$smallName\";' ce9\n"; } close (FH); '_EOF_' # << happy emacs chmod +x replicate.pl ./replicate.pl > replicate.sh sh replicate.sh # ce9.orfToGene.name - hits 30296 of 30296 ok # ce9.sangerBlastTab.query - hits 1255312 of 1255312 ok # ce9.sangerBlastTab.target - hits 1255312 of 1255312 ok # ce9.sangerCanonical.transcript - hits 20051 of 20051 ok # ce9.sangerIsoforms.transcript - hits 30296 of 30296 ok # ce9.sangerLinks.orfName - hits 30115 of 30115 ok # ce9.sangerPep.name - hits 23771 of 23771 ok # ce9.sangerToRefSeq.name - hits 29995 of 29995 ok # ce9.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok ############################################################################### # ADJUST KIMEXPDISTANCE NAMES TO MATCH SANGERCANONICAL # (DONE - 2010-09-16 - Hiram) # This business taken from Angie's fixups in ce4.txt # kimExpDistance has all [A-Z]+[0-9]+\.[0-9]+ names, but now sangerCanonical # has some of the new [A-Z]+[0-9]+\.[0-9]+\.[0-9]+ names, so when hgNear # looks for a canonical name in kimExpDistance, it often can't find it. # Canonical also has [A-Z]+[0-9]+\.[0-9]+[a-z] # and [A-Z]+[0-9]+\.[0-9]+[a-z]\.[0-9]+ # kimExpDistance has 4 [A-Z]+[0-9]+\.[0-9]+[A-Z] # Update the names in kimExpDistance to match sangerCanonical. mkdir /hive/data/genomes/ce9/bed/fixupKim cd /hive/data/genomes/ce9/bed/fixupKim # Look at just the names and how they differ between the two sources: hgsql ce9 -N -e 'select transcript from sangerCanonical' \ | sort > canonicalName.list hgsql ce9 -N -e 'select * from kimExpDistance' > ce9.kimExpDistance awk -F"\t" '{print $1}' ce9.kimExpDistance | sort -u > ce9.kimExpNames sdiff /tmp/canonicalName.list ce9.kimExpNames | less comm -23 canonicalName.list ce9.kimExpNames | wc -l #8539 comm -13 canonicalName.list ce9.kimExpNames | wc -l #7721 comm -12 canonicalName.list ce9.kimExpNames | wc -l #11413 # Pretty significant non-overlap, but I think much of it can be # resolved by slight tweaks to kimExpDistance names so that they # match the latest, more-specifically-versioned canonical transcripts. # Also, kimExpDistance has many names with no match in canonical, # which means many thousands of unnecessary rows bloating the table. cat > fixExpDistance.pl <<'_EOF_' #!/bin/env perl use warnings; use strict; my @kimExpNames; my @kimSpecial; open(FH, ") { chomp; push @kimExpNames, $_; push @kimSpecial, $_ if ($_ =~ /\dA$/); } close(FH); my %kim2canon; open(FH, "canonicalName.list") || die; while () { chomp; my $kim = $_; my @special = grep m/^$kim$/i, @kimSpecial; if (scalar(@special)) { $kim2canon{$special[0]} = $kim; } else { my $simple = $kim; $simple =~ s/^(\w+\.\d+).*$/$1/; $kim2canon{$simple} = $kim; } } close(FH); while (<>) { chomp; my ($query, $target, $distance) = split('\t'); my $canonicalQ = $kim2canon{$query}; my $canonicalT = $kim2canon{$target}; if (defined $canonicalQ && defined $canonicalT) { print "$canonicalQ\t$canonicalT\t$distance\n"; } } '_EOF_' # << emacs chmod a+x fixExpDistance.pl ./fixExpDistance.pl ce9.kimExpDistance > kimExpDistanceFixedNames.tab # Look the file over, grep for some IDs in /tmp/canonicalName.list. # Load into the db as kimExprDistanceTest. It will take a while # because it's still 12M rows (down from 19M). time nice hgLoadSqlTab ce9 kimExpDistanceTest \ ~/kent/src/hg/lib/distance.sql kimExpDistanceFixedNames.tab # real 0m37.241s # Test hgNear with kimExpDistanceTest instead of kimExpDistance. # Don't delete the original table just yet, but do clean it up soon # because it's quite large. hgsql ce9 -e 'rename table kimExpDistance to kimExpDistanceOrig;' hgsql ce9 -e 'rename table kimExpDistanceTest to kimExpDistance;' runJoiner.csh ce9 kimExpDistance # Test hgNear again with kimExpDistance. ############################################################################ ## create a sangerGene name to WBGene ID reference table ## to be used to construct URL mkdir /hive/data/genomes/ce9/bed/WBGeneID cd /hive/data/genomes/ce9/bed/WBGeneID hgsql -N -e "select name from sangerGene;" ce9 | sort -u \ > ce9.sangerGene.name cut -f1,3 ../../ws210/sequences/protein/wormpep.table210 \ | sed -e "s/^>//" | awk '{printf "%s\t%s\n", $1, $2}' \ | sort > geneToWBId.tab sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##"\ ../../ws210/tarFile/url.txt | sort > sangerGeneToWBGeneID.tab sort -u geneToWBId.tab sangerGeneToWBGeneID.tab > two.lists.geneToWBid.tab cat << '_EOF_' > addNames.pl #!/bin/env perl use strict; use warnings; my %sangerGenes; my %shortNames; open (FH, ") { chomp $fullName; $sangerGenes{$fullName} = 1; my @a = split('\.', $fullName); my $shortName = $fullName; if (scalar(@a) == 3) { $shortName = "$a[0].$a[1]"; } elsif (scalar(@a) == 2) { $shortName = "$a[0]"; } if (exists($shortNames{$shortName})) { $shortNames{$shortName} .= ",$fullName"; } else { $shortNames{$shortName} = "$fullName"; } } close (FH); open (FH, ") { chomp $line; my ($name, $wbId) = split('\t', $line); if (exists($sangerGenes{$name})) { printf "%s\t%s\n", $name, $wbId; } elsif (exists($shortNames{$name})) { my @a = split(',', $shortNames{$name}); for (my $i = 0; $i < scalar(@a); ++$i) { printf "%s\t%s\n", $a[$i], $wbId; } } } close (FH); '_EOF_' # << happy emacs chmod +x addNames.pl ./addNames.pl | sort -u > fixedGeneToWBGeneID.tab hgLoadSqlTab ce9 sangerGeneToWBGeneID \ ~/kent/src/hg/lib/sangerGeneToWBGeneID.sql \ fixedGeneToWBGeneID.tab cd ~/kent/src/hg/makeDb/schema joinerCheck -identifier=wormBaseId -database=ce9 -times -keys ./all.joiner # ce9.kimExpDistance.query - hits 11528085 of 11528085 ok # ce9.kimExpDistance.target - hits 11528085 of 11528085 ok # ce9.orfToGene.name - hits 28125 of 28125 ok # ce9.sangerBlastTab.query - hits 1228800 of 1228800 ok # ce9.sangerBlastTab.target - hits 1228800 of 1228800 ok # ce9.sangerCanonical.transcript - hits 19952 of 19952 ok # ce9.sangerIsoforms.transcript - hits 28152 of 28152 ok # ce9.sangerLinks.orfName - hits 28152 of 28152 ok # ce9.sangerPep.name - hits 28152 of 28152 ok # ce9.sangerToRefSeq.name - hits 27945 of 27945 ok # ce9.sangerGeneToWBGeneID.sangerGene - hits 28152 of 28152 ok ############################################################################ ## Blastz SELF (DONE - 2010-09-17 - Hiram) # screen mkdir /hive/data/genomes/ce9/bed/lastzCe9.2010-09-17 cd /hive/data/genomes/ce9/bed/lastzCe9.2010-09-17 cat << '_EOF_' > DEF # ce9 vs ce9 - Self alignment BLASTZ_H=2000 BLASTZ_M=200 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: elegans Ce9 SEQ2_DIR=/scratch/data/ce9/ce9.2bit SEQ2_LEN=/scratch/data/ce9/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/lastzCe9.2010-09-17 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \ -smallClusterHub=memk > do.log 2>&1 & # real 30m2.833s nice -n +19 featureBits ce9 chainSelfLink > fb.ce9.chainSelfLink.txt 2>&1 # 30911955 bases of 100286004 (30.824%) in intersection ########################################################################## ## LASTZ cb3 (DONE - 2010-09-20 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzCb3.2010-09-20 cd /hive/data/genomes/ce9/bed/blastzCb3.2010-09-20 cat << '_EOF_' > DEF # ce9 vs cb3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae cb3 SEQ2_DIR=/scratch/data/cb3/cb3.2bit SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzCb3.2010-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -bigClusterHub=pk -smallClusterHub=memk \ > do.log 2>&1 & # real 13m52.946s cat fb.ce9.chainCb3Link.txt # 42421395 bases of 100286004 (42.300%) in intersection # syntenic mapping time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=syntenicNet -syntenicNet \ -workhorse=hgwdev -bigClusterHub=pk -smallClusterHub=memk \ > synNet.log 2>&1 & # real 0m39.019s # swap, this is also in cb3.txt mkdir /hive/data/genomes/cb3/bed/blastz.ce9.swap cd /hive/data/genomes/cb3/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCb3.2010-09-20/DEF \ -workhorse=hgwdev -bigClusterHub=pk -smallClusterHub=memk \ -swap > swap.log 2>&1 & # real 2m48.539s cat fb.cb3.chainCe9Link.txt # 43115973 bases of 108433446 (39.763%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCb3.2010-09-20/DEF \ -workhorse=hgwdev -bigClusterHub=pk -smallClusterHub=memk \ -continue=syntenicNet -syntenicNet -swap > synNet.log 2>&1 & # real 0m39.439s ############################################################################ ## LASTZ caeRem3 (DONE - 2010-09-20 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzCaeRem3.2010-09-20 cd /hive/data/genomes/ce9/bed/blastzCaeRem3.2010-09-20 cat << '_EOF_' > DEF # ce9 vs caeRem3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 caeRem3 SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce9/bed/blastzCaeRem3.2010-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 & # real 24m19.320s cat fb.ce9.chainCaeRem3Link.txt # 41841184 bases of 100286004 (41.722%) in intersection # syntenic mapping time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=syntenicNet -syntenicNet \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk > synNet.log 2>&1 & # real 2m48.968s # swap, this is also in caeRem3.txt mkdir /hive/data/genomes/caeRem3/bed/blastz.ce9.swap cd /hive/data/genomes/caeRem3/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ /hive/data/genomes/ce9/bed/blastzCaeRem3.2010-09-20/DEF \ -bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 & # real 2m11.777s cat fb.caeRem3.chaince9Link.txt # 46320775 bases of 138406388 (33.467%) in intersection # syntenic mapping time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ /hive/data/genomes/ce9/bed/blastzCaeRem3.2010-09-20/DEF \ -continue=syntenicNet -syntenicNet \ -bigClusterHub=pk -smallClusterHub=memk -swap > synNet.log 2>&1 & # real 0m38.257s ############################################################################ ## LASTZ caePb2 (DONE - 2010-09-20 - Hiram) ssh kkstore06 screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20 cd /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20 cat << '_EOF_' > DEF # ce9 vs caePb2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. PB2801 caePb2 SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit SEQ2_LEN=/scratch/data/caePb2/chrom.sizes SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 & # real 35m14.835s cat fb.ce9.chainCaePb2Link.txt # 40793086 bases of 100286004 (40.677%) in intersection # syntenic mapping time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=syntenicNet -syntenicNet \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk > synNet.log 2>&1 & # real 3m21.202s # swap, this is also in caePb2.txt mkdir /hive/data/genomes/caePb2/bed/blastz.ce9.swap cd /hive/data/genomes/caePb2/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 & # real 2m36.529s cat fb.caePb2.chainCe9Link.txt # 55084649 bases of 170473138 (32.313%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaePb2.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -continue=syntenicNet -syntenicNet \ -bigClusterHub=pk -smallClusterHub=memk -swap > synNet.log 2>&1 & # real 0m42.368s ############################################################################ ## LASTZ caeJap1 (DONE - 2010-09-20 - Hiram) # these tables were dropped, the caeJap3 is more appropriate screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20 cd /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20 cat << '_EOF_' > DEF # ce9 vs caeJap1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. japonica caeJap1 SEQ2_DIR=/scratch/data/caeJap1/caeJap1.2bit SEQ2_LEN=/scratch/data/caeJap1/chrom.sizes SEQ2_CTGDIR=/scratch/data/caeJap1/caeJap1.TrfWM.supers.2bit SEQ2_CTGLEN=/scratch/data/caeJap1/caeJap1.TrfWM.supers.sizes SEQ2_LIFT=/scratch/data/caeJap1/caeJap1.chrUn.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -bigClusterHub=pk \ -qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 & # real 16m16.650s cat fb.ce9.chainCaeJap1Link.txt # 26876526 bases of 100286004 (26.800%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -bigClusterHub=pk \ -continue=syntenicNet -syntenicNet \ -qRepeats=windowmaskerSdust -smallClusterHub=memk > synNet.log 2>&1 & # real 0m13.124s # swap, this is also in caeJap1.txt mkdir /hive/data/genomes/caeJap1/bed/blastz.ce9.swap cd /hive/data/genomes/caeJap1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaeJap1.2010-09-20/DEF \ -workhorse=hgwdev -qRepeats=windowmaskerSdust \ -bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 & # real 1m18.062s cat fb.caeJap1.chainCe9Link.txt # 26200691 bases of 129347181 (20.256%) in intersection ############################################################################ ## LASTZ melHap1 (DONE - 2010-09-22 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22 cd /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22 cat << '_EOF_' > DEF # ce9 vs melHap1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae melHap1 SEQ2_DIR=/scratch/data/melHap1/melHap1.2bit SEQ2_LEN=/scratch/data/melHap1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ > do.log 2>&1 & # real 12m35.686s cat fb.ce9.chainMelHap1Link.txt # 3933301 bases of 100286004 (3.922%) in intersection # swap, this is also in melHap1.txt mkdir /hive/data/genomes/melHap1/bed/blastz.ce9.swap cd /hive/data/genomes/melHap1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzMelHap1.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 0m23.154s cat fb.melHap1.chainCe9Link.txt # 3631046 bases of 53017507 (6.849%) in intersection ############################################################################ ## LASTZ melInc1 (DONE - 2010-09-22 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22 cd /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22 cat << '_EOF_' > DEF # ce9 vs melInc1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae melInc1 SEQ2_DIR=/scratch/data/melInc1/melInc1.2bit SEQ2_LEN=/scratch/data/melInc1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ > do.log 2>&1 & # real 28m11.154s cat fb.ce9.chainMelInc1Link.txt # 3310201 bases of 100286004 (3.301%) in intersection # swap, this is also in melInc1.txt mkdir /hive/data/genomes/melInc1/bed/blastz.ce9.swap cd /hive/data/genomes/melInc1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzMelInc1.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 0m36.095s cat fb.melInc1.chainCe9Link.txt # 4240790 bases of 82095019 (5.166%) in intersection ############################################################################ ## LASTZ bruMal1 (DONE - 2010-09-22 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22 cd /hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22 cat << '_EOF_' > DEF # ce9 vs bruMal1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae bruMal1 SEQ2_DIR=/scratch/data/bruMal1/bruMal1.2bit SEQ2_LEN=/scratch/data/bruMal1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ > do.log 2>&1 & # real 60m27.833s cat fb.ce9.chainBruMal1Link.txt # 4804043 bases of 100286004 (4.790%) in intersection # swap, this is also in bruMal1.txt mkdir /hive/data/genomes/bruMal1/bed/blastz.ce9.swap cd /hive/data/genomes/bruMal1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzBruMal1.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 19m54.633s cat fb.bruMal1.chainCe9Link.txt # 4869447 bases of 89235536 (5.457%) in intersection ############################################################################ ## LASTZ caeJap3 (DONE - 2010-09-22 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzCaeJap3.2010-09-22 cd /hive/data/genomes/ce9/bed/blastzCaeJap3.2010-09-22 cat << '_EOF_' > DEF # ce9 vs caeJap3 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae caeJap3 SEQ2_DIR=/scratch/data/caeJap3/caeJap3.2bit SEQ2_LEN=/scratch/data/caeJap3/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzCaeJap3.2010-09-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ > do.log 2>&1 & # real 67m37.407s cat fb.ce9.chainCaeJap3Link.txt # 27658992 bases of 100286004 (27.580%) in intersection # swap, this is also in caeJap3.txt mkdir /hive/data/genomes/caeJap3/bed/blastz.ce9.swap cd /hive/data/genomes/caeJap3/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzCaeJap3.2010-09-22/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ -swap > swap.log 2>&1 & # real 5m37.231s cat fb.caeJap3.chainCe9Link.txt # 29986108 bases of 154057934 (19.464%) in intersection ############################################################################ ## LASTZ priPac1 (DONE - 2010-09-23 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23 cd /hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23 cat << '_EOF_' > DEF # ce9 vs priPac1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae priPac1 SEQ2_DIR=/scratch/data/priPac1/priPac1.2bit SEQ2_LEN=/scratch/data/priPac1/chrom.sizes SEQ2_CTGDIR=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.2bit SEQ2_CTGLEN=/hive/data/genomes/priPac1/trfWM.maskedContigs/priPac1.TrfWM.supers.sizes SEQ2_LIFT=/hive/data/genomes/priPac1/downloads/super.lift SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -verbose=2 -bigClusterHub=pk \ -qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 & # real 14m20.785s cat fb.ce9.chainPriPac1Link.txt # 6251889 bases of 100286004 (6.234%) in intersection # swap, this is also in priPac1.txt mkdir /hive/data/genomes/priPac1/bed/blastz.ce9.swap cd /hive/data/genomes/priPac1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzPriPac1.2010-09-23/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk -smallClusterHub=memk \ -swap > swap.log 2>&1 & # real 0m43.464s cat fb.priPac1.chainCe9Link.txt # 6799226 bases of 145948246 (4.659%) in intersection ############################################################################ ## LASTZ haeCon1 (DONE - 2010-09-22 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23 cd /hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23 cat << '_EOF_' > DEF # ce9 vs haeCon1 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae haeCon1 SEQ2_DIR=/scratch/data/haeCon1/haeCon1.2bit SEQ2_LEN=/scratch/data/haeCon1/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk \ > do.log 2>&1 & # real 62m3.845s cat fb.ce9.chainHaeCon1Link.txt # 7484042 bases of 100286004 (7.463%) in intersection # recip best to see how it looks time doRecipBest.pl -workhorse=hgwdev -verbose=2 -buildDir=`pwd` \ ce9 haeCon1 > rbest.log 2>&1 & # real 4m5.408s # swap, this is also in haeCon1.txt mkdir /hive/data/genomes/haeCon1/bed/blastz.ce9.swap cd /hive/data/genomes/haeCon1/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=pk \ -workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 & # real 7m35s cat fb.haeCon1.chainCe9Link.txt # 8860569 bases of 278844984 (3.178%) in intersection ############################################################################ ## 10-Way Multiz (DONE - 2010-10-15 - Hiram) mkdir /hive/data/genomes/ce9/bed/multiz10way cd /hive/data/genomes/ce9/bed/multiz10way # starting with the 10way tree created from 44 way tree cat << '_EOF_' > 10way.nh ((((((ce9:0.003,((cb3:0.005,caeRem3:0.003):0.004,caePb2:0.013):0.002):0.001,caeJap3:0.023):0.04,haeCon1:0.06):0.06,priPac2:0.12):0.06,(melHap1:0.14,melInc1:0.15):0.03):0.06,bruMal1:0.24); '_EOF_' # << happy emacs sed -e "s/ce9/C._elegans/; s/cb3/C._briggsae/; s/caeRem3/C._remanei/; s/caePb2/C._brenneri/; s/caeJap3/C._japonica/; s/haeCon1/H._contortus/; s/priPac2/P._pacificus/; s/melHap1/M._hapla/; s/melInc1/M._incognita/; s/bruMal1/B._malayi/;" 10way.nh > commonNames.nh # Use this commonNames specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/ce9_10way.gif /cluster/bin/phast/all_dists 10way.nh > 10way.distances.txt # Use this output to create the table below, with this perl script: cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "grep -y ce9 10way.distances.txt | sort -k3,3n|") or die "can not read 10way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($ce9, $D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/ce9/bed/lastz.$D/fb.ce9." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/blastz.ce9.swap/fb.${D}.chainCe9Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; if ($swapMeasure eq "N/A") { printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } else { printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist, $orgName, $D, $chainLinkMeasure, $swapMeasure } } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainCe9Link chain linearGap # distance on ce9 on other minScore # 01 0.0120 - C. remanei caeRem3 (% 41.722) (% 33.467) # 02 0.0140 - C. briggsae cb3 (% 42.300) (% 39.763) # 03 0.0180 - C. brenneri caePb2 (% 40.677) (% 32.313) # 04 0.0270 - C. japonica caeJap3 (% 27.580) (% 19.464) # 05 0.1040 - H. contortus haeCon1 (% 7.463) (% 3.178) # 06 0.2240 - P. pacificus priPac2 (% 6.234) (% 4.659) # 06 0.2240 - P. pacificus priPac2 (% 5.862) (% 4.692) # 07 0.3340 - M. hapla melHap1 (% 3.922) (% 6.849) # 08 0.3440 - M. incognita melInc1 (% 3.301) (% 5.166) # 09 0.4640 - B. malayi bruMal1 (% 4.790) (% 5.457) # create species list and stripped down tree for autoMZ sed -e "s/:[0-9]\.[0-9][0-9]*//g; s/;//" 10way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list cd /hive/data/genomes/ce9/bed/multiz10way # bash shell syntax here ... export H=/hive/data/genomes/ce9/bed mkdir mafLinks # verify the alignment type is correct, watch the output of this: for G in `sed -e "s/ce9 //" species.list` do mkdir mafLinks/$G if [ -s ${H}/lastz.${G}/mafRBestNet/chrI.maf.gz ]; then echo " # $G - recipBest" ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafSynNet/chrI.maf.gz ]; then echo " # $G - synNet" ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafNet/chrI.maf.gz ]; then echo " # $G - mafNet" ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G else echo "missing directory lastz.${G}/*Net" fi fi fi done # cb3 - synNet # caeRem3 - synNet # caePb2 - synNet # caeJap3 - synNet # haeCon1 - recipBest # priPac2 - mafNet # melHap1 - mafNet # melInc1 - mafNet # bruMal1 - mafNet # the autoMultiz cluster run # most interesting, this would not work on memk # on memk autoMZ somehow got the tree.nh full of back slashes ? ssh swarm cd /hive/data/genomes/ce9/bed/multiz10way/ mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = ce9 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/ce9/bed/multiz10way/mafLinks /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../tree.nh ../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/ $db//" species.list`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then /bin/echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else /bin/echo -e "##maf version=1 scoring=autoMZ\n\n##eof maf" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c.maf $result /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/ce9/bed/multiz10way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs find ../mafLinks -type l | sed -e "s/.*chr/chr/; s/.maf.gz//" \ | sort -u > chr.part.list ssh memk cd /hive/data/genomes/ce9/bed/multiz10way/run gensub2 chr.part.list single template jobList para create jobList para try para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 672s 11.20m 0.19h 0.01d 0.000 y # IO & Wait Time: 87s 1.45m 0.02h 0.00d 0.000 y # Average job time: 108s 1.81m 0.03h 0.00d # Longest finished job: 132s 2.20m 0.04h 0.00d # Submission to last job: 145s 2.42m 0.04h 0.00d # load tables for a look ssh hgwdev mkdir -p /gbdb/ce9/multiz10way/maf cd /hive/data/genomes/ce9/bed/multiz10way/maf ln -s `pwd`/*.maf /gbdb/ce9/multiz10way/maf # this generates an immense multiz10way.tab file in the directory # where it is running. Best to run this over in scratch. cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/ce9/multiz10way/maf ce9 multiz10way # Loaded 427736 mafs in 7 files from /gbdb/ce9/multiz10way/maf # real 0m10.362s # load summary table time nice -n +19 cat /gbdb/ce9/multiz10way/maf/*.maf \ | hgLoadMafSummary ce9 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz10waySummary stdin # Created 161206 summary blocks from 1535167 components # and 427736 mafs from stdin # real 0m10.960s featureBits ce9 multiz10waySummary # 61377768 bases of 100286004 (61.203%) in intersection ############################################################################## # 10-way Gap annotation (DONE - 2010-09-24 - Hiram) # Gap Annotation # prepare bed files with gap info mkdir /hive/data/genomes/ce9/bed/multiz10way/anno cd /hive/data/genomes/ce9/bed/multiz10way/anno mkdir maf run # most of these will already exist from previous multiple alignments # remove the echo from in front of the twoBitInfo command to get them # to run if this loop appears to be correct for DB in `cat ../species.list` do CDIR="/hive/data/genomes/${DB}" if [ ! -f ${CDIR}/${DB}.N.bed ]; then echo "creating ${DB}.N.bed" echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed else ls -og ${CDIR}/${DB}.N.bed fi done cd run rm -f nBeds sizes for DB in `sed -e "s/ce9 //" ../../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # the annotation step requires large memory, run on memk nodes ssh memk cd /hive/data/genomes/ce9/bed/multiz10way/anno/run ls ../../maf | sed -e "s/.maf//" > chr.list cat << '_EOF_' > template #LOOP ./anno.csh $(root1) {check out line+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cat << '_EOF_' > anno.csh #!/bin/csh -fe set inMaf = ../../maf/$1.maf set outMaf = ../maf/$1.maf rm -f $outMaf mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/ce9/ce9.2bit $outMaf '_EOF_' # << happy emacs chmod +x anno.csh ssh pk cd /hive/data/genomes/ce9/bed/multiz10way/anno/run gensub2 chr.list single template jobList para create jobList para try # Completed: 7 of 7 jobs # CPU time in finished jobs: 25s 0.41m 0.01h 0.00d 0.000 y # IO & Wait Time: 51s 0.86m 0.01h 0.00d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest finished job: 13s 0.22m 0.00h 0.00d # Submission to last job: 22s 0.37m 0.01h 0.00d ssh hgwdev rm -fr /gbdb/ce9/multiz10way/maf mkdir /gbdb/ce9/multiz10way/maf cd /hive/data/genomes/ce9/bed/multiz10way/anno/maf ln -s `pwd`/*.maf /gbdb/ce9/multiz10way/maf/ # by loading this into the table multiz10way, it will replace the # previously loaded table with the unannotated mafs # huge temp files are made, do them on local disk cd /data/tmp time nice -n +19 hgLoadMaf \ -pathPrefix=/gbdb/ce9/multiz10way/maf ce9 multiz10way # Loaded 480735 mafs in 7 files from /gbdb/ce9/multiz10way/maf # real 0m13.295s time nice -n +19 cat /gbdb/ce9/multiz10way/maf/*.maf \ | hgLoadMafSummary ce9 -minSize=10000 -verbose=2 \ -mergeGap=500 -maxSize=50000 multiz10waySummary stdin # Created 161206 summary blocks from 1535167 components and 480735 mafs # from stdin # real 0m17.228s ############################################################################## # Phylogenetic tree from 10-way (DONE - 2010-09-24 - Hiram) # Extract 4-fold degenerate sites based on # of sangerGenes, coding mkdir /hive/data/genomes/ce9/bed/multiz10way/4d cd /hive/data/genomes/ce9/bed/multiz10way/4d hgsql ce9 -Ne "select * from sangerGene;" | cut -f1-10 > sangerGene.gp wc -l sangerGene.gp # 28152 sangerGene.gp genePredSingleCover sangerGene.gp stdout | sort > sangerGeneNR.gp wc -l sangerGeneNR.gp # 20227 sangerGeneNR.gp ssh memk mkdir /hive/data/genomes/ce9/bed/multiz10way/4d/run cd /hive/data/genomes/ce9/bed/multiz10way/4d/run mkdir ../mfa # whole chrom mafs version, using new version of # uses memory-efficient version of phast, from Melissa Hubisz at Cornell # mjhubisz at gmail.com cat << '_EOF_' > 4d.csh #!/bin/csh -fe set r = "/hive/data/genomes/ce9/bed/multiz10way" set c = $1 set infile = $r/maf/$2 set outfile = $3:t cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/sangerGeneNR.gp > $c.gp set PHASTBIN=/cluster/bin/phast.2008-12-18 $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/mfa/$outfile rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls /hive/data/genomes/ce9/bed/multiz10way/maf > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList rm -fr /cluster/data/ce9/bed/multiz10way/4d/mfa mkdir /cluster/data/ce9/bed/multiz10way/4d/mfa para create jobList para try para check para time # Completed: 7 of 7 jobs # CPU time in finished jobs: 126s 2.10m 0.03h 0.00d 0.000 y # IO & Wait Time: 37s 0.62m 0.01h 0.00d 0.000 y # Average job time: 23s 0.39m 0.01h 0.00d # Longest finished job: 34s 0.57m 0.01h 0.00d # Submission to last job: 47s 0.78m 0.01h 0.00d # combine mfa files cd .. sed -e "s/ /,/g" ../species.list > species.lst /cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \ sed s/"> "/">"/ > 4d.all.mfa # use phyloFit to create tree model (output is phyloFit.mod) export PHASTBIN=/cluster/bin/phast.2008-12-18 time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \ --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa # real 32m37.505s # third try with MAF nets for all: # TREE: ((((((ce9:0.533224,((cb3:0.466081,caeRem3:0.426495):0.048821,caePb2:0.469105):0.095490):0.130228,caeJap3:0.607132):0.203809,haeCon1:1.132838):0.127990,priPac2:0.791549):0.492695,(melHap1:0.219730,melInc1:0.208707):0.656773):0.606056,bruMal1:0.606056); # second one with priPac2: (synNets for 5 cae*, recipBest for haeCon1) # TREE: ((((((ce9:0.494072,((cb3:0.459335,caeRem3:0.411553):0.050715,caePb2:0.455175):0.112394):0.115572,caeJap3:0.524552):0.126211,haeCon1:1.207917):0.101818,priPac2:0.852081):0.424500,(melHap1:0.215350,melInc1:0.215677):0.651712):0.631468,bruMal1:0.631468); # first one with priPac1: # TREE: ((((((ce9:0.493597,((cb3:0.459180,caeRem3:0.411537):0.051063,caePb2:0.455061):0.112749):0.117493,caeJap3:0.522910):0.127938,haeCon1:1.203176):0.098413,priPac1:0.857529):0.428754,(melHap1:0.218602,melInc1:0.211924):0.643495):0.633675,bruMal1:0.633675); mv phyloFit.mod phyloFit.all.mod grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.10way.nh sed -e 's/,haeCon1.*//' species.lst > caenorhabditis.list $PHASTBIN/tree_doctor \ --prune-all-but=`cat caenorhabditis.list` \ tree_4d.10way.nh > tree_4d.10way.caenorhabditis.nh $PHASTBIN/tree_doctor \ --prune-all-but=`cat caenorhabditis.list` \ ../tree-commas.nh | sed -e "s#:0.000000##g" \ > tree_commas.caenorhabditis.nh cat << '_EOF_' > trimCae.pl #!/bin/env perl use warnings; use strict; my $file = shift; my $matchCae = 0; open (FH, "<$file") or die "can not read $file"; while (my $line = ) { chomp $line; if ($line =~ m/^> /) { if ($line =~ m/^> ce9/) { $matchCae = 1; } elsif ($line =~ m/^> cb3/) { $matchCae = 1; } elsif ($line =~ m/^> caePb2/) { $matchCae = 1; } elsif ($line =~ m/^> caeRem3/) { $matchCae = 1; } elsif ($line =~ m/^> caeJap3/) { $matchCae = 1; } else { $matchCae = 0; } } if ($matchCae) { printf "%s\n", $line; } } '_EOF_' # << happy emacs chmod +x trimCae.pl for C in I II III IV V X M do ./trimCae.pl mfa/chr${C}.mfa > cae.mfa/chr${C}.mfa done $PHASTBIN/msa_view \ --aggregate=`cat caenorhabditis.list` cae.mfa/chr*.mfa \ | sed -e "s/> />/" > 4d.caenorhabditis.mfa time $PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA \ --subst-mod REV --tree tree_commas.caenorhabditis.nh \ 4d.caenorhabditis.mfa # real 0m3.603s # real 0m5.503s mv phyloFit.mod phyloFit.caenorhabditis.mod # TREE: ((ce9:0.534619,((cb3:0.466369,caeRem3:0.424946):0.053704,caePb2:0.466644):0.096947):0.369828,caeJap3:0.369828); # second trial with synNet for 5 cae* # TREE: ((ce9:0.495177,((cb3:0.460305,caeRem3:0.410943):0.052233,caePb2:0.453726):0.117143):0.318229,caeJap3:0.318229); ######################################################################### # CREATE CONSERVATION WIGGLE WITH PHASTCONS - 10-WAY # (DONE - 2010-09-27 - Hiram) # this 10-way alignment did not allow the tuning loop to function. # The phyloBoot procedure did not complete, even on a single chromosome # after three days running time. Falling back to similar procedures # use in other large alignments, hg19 46-way, mm9 30-way, calJac3 13-way # We need the fasta sequence for this business mkdir /hive/data/genomes/ce9/bed/multiz10way/ce9Fasta cd /hive/data/genomes/ce9/bed/multiz10way/ce9Fasta for C in `cut -f1 ../../../chrom.sizes` do echo "twoBitToFa -seq=${C} ../../../ce9.2bit ${C}.fa" twoBitToFa -seq=${C} ../../../ce9.2bit ${C}.fa done # verify nothing lost, should be the same as previously: faSize *.fa # 100286004 bases (0 N's 100286004 real 86863769 upper # 13422235 lower) in 7 sequences in 7 files # %13.38 masked total, %13.38 masked real mkdir /hive/data/genomes/ce9/bed/multiz10way/cons # create 6,000,000 window sized SS files: mkdir /hive/data/genomes/ce9/bed/multiz10way/cons/ss1M cd /hive/data/genomes/ce9/bed/multiz10way/cons/ss1M time for C in `cut -f1 /hive/data/genomes/ce9/chrom.sizes` do mkdir ${C} echo msa_split $C /cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin/msa_split \ /hive/data/genomes/ce9/bed/multiz10way/maf/${C}.maf \ --refseq /hive/data/genomes/ce9/bed/multiz10way/ce9Fasta/${C}.fa \ --in-format MAF --windows 6000000,0 --between-blocks 5000 \ --out-format SS --out-root ${C}/${C} done # real 0m58.679s mkdir /hive/data/genomes/ce9/bed/multiz10way/cons/msa.split cd /hive/data/genomes/ce9/bed/multiz10way/cons/msa.split export \ MSA_SPLIT="/cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin/msa_split" for C in I II III IV V X M do echo chr${C} mkdir chr${C} ${MSA_SPLIT} ../../maf/chr${C}.maf --in-format MAF --out-format SS \ --refseq ../../ce9Fasta/chr${C}.fa \ --out-root chr${C}/chr${C} --windows 1000000,0 \ --between-blocks 5000 done # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh pk mkdir /hive/data/genomes/ce9/bed/multiz10way/cons/run.cons cd /hive/data/genomes/ce9/bed/multiz10way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/ce9/bed/multiz10way/cons set tmp = $cons/tmp/$f /bin/mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then /bin/ln -s $cons/$grp/$grp.non-inf $tmp endif /bin/ln -s $cons/$grp/$grp.mod $tmp /bin/ln -s $ssSrc/msa.split/$cX/$f.ss $tmp pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null /bin/mkdir -p pp/$c bed/$c /bin/sleep 4 /bin/touch pp/$c bed/$c /bin/rm -f pp/$c/$f.pp /bin/rm -f bed/$c/$f.bed /bin/mv $tmp/$f.pp pp/$c /bin/mv $tmp/$f.bed bed/$c /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix # --expected-length 15 --target-coverage 0.55 --rho = 0.3 cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 15 0.55 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs ls ../msa.split/chr* | grep ".ss$" | sed -e "s/.ss$//" > ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/ce9/bed/multiz10way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/phyloFit.all.mod all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. # do not let these run full blast, they run too fast, 20 at a time is fine para -maxJob=20 push # Completed: 104 of 104 jobs # CPU time in finished jobs: 411s 6.85m 0.11h 0.00d 0.000 y # IO & Wait Time: 680s 11.33m 0.19h 0.01d 0.000 y # Average job time: 10s 0.17m 0.00h 0.00d # Longest finished job: 14s 0.23m 0.00h 0.00d # Submission to last job: 62s 1.03m 0.02h 0.00d # create Most Conserved track ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/cons/all cat bed/*/*.bed | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database hgLoadBed ce9 phastConsElements10way mostConserved.bed # Loaded 339396 elements of size 5 # all sequences at MAF net alignment type, featureBits ce9 -enrichment sangerGene:cds mostConserved.bed # --expected-length 15 --target-coverage 0.55 --rho = 0.2 # sangerGene:cds 25.328%, mostConserved.bed 37.092%, # both 17.507%, cover 69.12%, enrich 1.86x # --expected-length 15 --target-coverage 0.55 --rho = 0.3 # sangerGene:cds 25.328%, mostConserved.bed 44.379%, # both 19.527%, cover 77.10%, enrich 1.74x # Try for 70% CDS cover # this same business in hg19 tried for phastConsElementsNway of %5 # but this nematode business covers a lot more featureBits ce9 -enrichment sangerGene:cds phastConsElements10way # sangerGene:cds 25.328%, phastConsElements10way 38.607%, # both 17.822%, cover 70.37%, enrich 1.82x # in ce6 6-way this was: # --expected-length 15 --target-coverage 0.55 # sangerGene:cds 25.329%, phastConsElements6way.bed 29.557%, # both 15.469%, cover 61.07%, enrich 2.07x # Create merged posterier probability file and wiggle track data # files cd /hive/data/genomes/ce9/bed/multiz10way/cons/all mkdir downloads for C in I II III IV V X M do find ./pp/chr${C} -type f | sed -e "s#^./##" | sed -e "s/\./_/" \ | sort -t'_' -k1,1 -k2,2n | sed -e "s/_/./" done | xargs cat | gzip > downloads/phastCons10way.wigFix.gz # encode those files into wiggle data zcat downloads/phastCons10way.wigFix.gz \ | wigEncode stdin phastCons10way.wig phastCons10way.wib # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc phastCons10way.wi? # 55M phastCons10way.wib # 8.4M phastCons10way.wig # 63M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons10way.wib /gbdb/ce9/multiz10way/phastCons10way.wib hgLoadWiggle -pathPrefix=/gbdb/ce9/multiz10way ce9 \ phastCons10way phastCons10way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/cons/all time nice -n +19 hgWiggle -doHistogram -db=ce9 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons10way > histogram.data 2>&1 # real 0m4.189s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Nematode Ce9 Histogram phastCons10way track" set xlabel " phastCons10way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## # (DONE - 2010-10-06 - Hiram) cd /hive/data/genomes/ce9/bed/multiz10way/cons mkdir caenorhabditis cd caenorhabditis # Using the .mod tree cp -p ../../4d/phyloFit.caenorhabditis.mod caenorhabditis.mod # list the others so they can be on the non-informative list for phastCons echo "haeCon1,priPac2,melHap1,melInc1,bruMal1" > caenorhabditis.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para -maxJob=20 push # Completed: 104 of 104 jobs # CPU time in finished jobs: 400s 6.67m 0.11h 0.00d 0.000 y # IO & Wait Time: 723s 12.05m 0.20h 0.01d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest finished job: 14s 0.23m 0.00h 0.00d # Submission to last job: 73s 1.22m 0.02h 0.00d # create Most Conserved track ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/cons/caenorhabditis cat bed/*/*.bed | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database hgLoadBed ce9 phastConsElements10wayCaenorhabditis mostConserved.bed # Loaded 798287 elements of size 5 featureBits ce9 -enrichment sangerGene:cds mostConserved.bed # ordinary MAF/net --expected-length 15 --target-coverage 0.55 --rho = 0.2 # sangerGene:cds 25.328%, mostConserved.bed 37.128%, # both 17.735%, cover 70.02%, enrich 1.89x # see what this looks like compared to the all species above: featureBits ce9 -enrichment sangerGene:cds \ phastConsElements10wayCaenorhabditis # sangerGene:cds 25.328%, phastConsElements10wayCaenorhabditis 37.524%, # both 17.349%, cover 68.50%, enrich 1.83x # all species above was: # sangerGene:cds 25.328%, phastConsElements10way 38.607%, # both 17.822%, cover 70.37%, enrich 1.82x # Create merged posterier probability file and wiggle track data # files cd /hive/data/genomes/ce9/bed/multiz10way/cons/caenorhabditis mkdir downloads for C in I II III IV V X M do find ./pp/chr${C} -type f | sed -e "s#^./##" | sed -e "s/\./_/" \ | sort -t'_' -k1,1 -k2,2n | sed -e "s/_/./" done | xargs cat | gzip > downloads/phastCons10way.caenorhabditis.wigFix.gz # encode those files into wiggle data zcat downloads/phastCons10way.caenorhabditis.wigFix.gz \ | wigEncode stdin phastCons10wayCaenorhabditis.wig \ phastCons10wayCaenorhabditis.wib # Converted stdin, upper limit 1.00, lower limit 0.00 du -hsc phastCons10wayCaenorhabditis.wi? # 55M phastCons10wayCaenorhabditis.wib # 9.6M phastCons10wayCaenorhabditis.wig # 65M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons10wayCaenorhabditis.wib \ /gbdb/ce9/multiz10way/phastCons10wayCaenorhabditis.wib hgLoadWiggle -pathPrefix=/gbdb/ce9/multiz10way ce9 \ phastCons10wayCaenorhabditis phastCons10wayCaenorhabditis.wig # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/cons/caenorhabditis time nice -n +19 hgWiggle -doHistogram -db=ce9 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons10wayCaenorhabditis > histogram.data 2>&1 # real 0m4.189s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Nematode Ce9 Histogram phastCons10wayCaenorhabditis track" set xlabel " phastCons10wayCaenorhabditis score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # phyloP conservation (DONE - 2010-10-06 - Hiram) # run phyloP with score=LRT ssh swarm mkdir /hive/data/genomes/ce9/bed/multiz10way/consPhyloP cd /hive/data/genomes/ce9/bed/multiz10way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.360 /cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin/modFreqs \ ../../cons/all/all.mod 0.360 > all.mod grep BACKGROUND ../../cons/caenorhabditis/caenorhabditis.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.342 /cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin/modFreqs \ ../../cons/caenorhabditis/caenorhabditis.mod 0.342 \ > caenorhabditis.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-08-09/bin set f = $1 set out = $2 set cName = $f:r:r set chrDir = $f:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/ce9/bed/multiz10way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = /hive/data/genomes/ce9/bed/multiz10way/cons/msa.split/$cName/$f.ss set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $f.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 mv $tmp/$f.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/msa.split -type f | grep ".ss$" \ | sed -e "s#../../cons/msa.split/##" | sed -e "s/.ss//" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs mkdir /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/all cd /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/all gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para -maxJob=20 push para time # Completed: 104 of 104 jobs # CPU time in finished jobs: 1292s 21.54m 0.36h 0.01d 0.000 y # IO & Wait Time: 726s 12.09m 0.20h 0.01d 0.000 y # Average job time: 19s 0.32m 0.01h 0.00d # Longest finished job: 30s 0.50m 0.01h 0.00d # Submission to last job: 114s 1.90m 0.03h 0.00d mkdir downloads for C in I II III IV V X M do find ./wigFix/chr${C} -type f | sed -e "s#^./##" | sed -e "s/\./_/" \ | sort -t'_' -k1,1 -k2,2n | sed -e "s/_/./" done | xargs cat | gzip > downloads/phyloP10way.wigFix.gz wigEncode downloads/phyloP10way.wigFix.gz phyloP10way.wig phyloP10way.wib # upper limit 4.36, lower limit -1.70 # loading the wiggle table: ln -s `pwd`/phyloP10way.wib /gbdb/ce9/multiz10way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ce9/multiz10way ce9 \ phyloP10way phyloP10way.wig # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/all time nice -n +19 hgWiggle -doHistogram -db=ce9 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP10way > histogram.data 2>&1 # real 0m3.859s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Nematode Ce9 Histogram phyloP10way track" set xlabel " phyloP10way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ########################################################################## # phyloP on the caenorhabditis subset (DONE - 2010-10-07 - Hiram) mkdir /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/caenorhabditis cd /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/caenorhabditis gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para -maxJob=20 push para time # Completed: 104 of 104 jobs # CPU time in finished jobs: 612s 10.21m 0.17h 0.01d 0.000 y # IO & Wait Time: 697s 11.61m 0.19h 0.01d 0.000 y # Average job time: 13s 0.21m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 78s 1.30m 0.02h 0.00d mkdir downloads for C in I II III IV V X M do find ./wigFix/chr${C} -type f | sed -e "s#^./##" | sed -e "s/\./_/" \ | sort -t'_' -k1,1 -k2,2n | sed -e "s/_/./" done | xargs cat | gzip > downloads/phyloP10way.caenorhabditis.wigFix.gz wigEncode downloads/phyloP10way.caenorhabditis.wigFix.gz \ phyloP10wayCaenorhabditis.wig phyloP10wayCaenorhabditis.wib # upper limit 1.95, lower limit -1.03 # loading the wiggle table: ln -s `pwd`/phyloP10wayCaenorhabditis.wib /gbdb/ce9/multiz10way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ce9/multiz10way ce9 \ phyloP10wayCaenorhabditis phyloP10wayCaenorhabditis.wig # real 0m1.327s # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/consPhyloP/caenorhabditis time nice -n +19 hgWiggle -doHistogram -db=ce9 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phyloP10wayCaenorhabditis > histogram.data 2>&1 # real 0m3.864s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Nematode Ce9 Histogram phyloP10wayCaenorhabditis track" set xlabel " phyloP10wayCaenorhabditis score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.04] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################ # Constructing Downloads (DONE - 2010-10-07 - Hiram) cd /hive/data/genomes/ce9 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 ce9 \ > downloads.log 2>&1 cd goldenPath mkdir multiz10way phastCons10way phyloP10way cd multiz10way ln -s ../../bed/multiz10way/10way.downloads.nh 10way.nh ln -s ../../bed/multiz10way/10way.commonNames.downloads.nh \ 10way.commonNames.nh cd ../phyloP10way ln -s \ ../../bed/multiz10way/consPhyloP/all/downloads/phyloP10way.wigFix.gz . ln -s \ ../../bed/multiz10way/consPhyloP/caenorhabditis/downloads/phyloP10way.caenorhabditis.wigFix.gz . ln -s ../../bed/multiz10way/consPhyloP/run.phyloP/all.mod ./nematode.mod ln -s ../../bed/multiz10way/consPhyloP/run.phyloP/caenorhabditis.mod ./caenorhabditis.mod cd ../phastCons10way ln -s ../../bed/multiz10way/cons/all/downloads/phastCons10way.wigFix.gz . ln -s \ ../../bed/multiz10way/cons/caenorhabditis/downloads/phastCons10way.caenorhabditis.wigFix.gz . ln -s ../../bed/multiz10way/cons/all/all.mod ./nematode.mod ln -s ../../bed/multiz10way/cons/caenorhabditis/caenorhabditis.mod . cd ../multiz10way #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" nice -n +19 featureBits -verbose=2 ce9 \ sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ce9 multiz10way stdin stdout \ -orgs=../../bed/multiz10way/species.list \ | gzip -c > sangerGene.upstream${S}.maf.gz echo "done sangerGene.upstream${S}.maf.gz" done ############################################################################ ## LASTZ priPac2 (DONE - 2010-09-29 - Hiram) screen # use screen to control the job mkdir /hive/data/genomes/ce9/bed/blastzPriPac2.2010-09-29 cd /hive/data/genomes/ce9/bed/blastzPriPac2.2010-09-29 cat << '_EOF_' > DEF # ce9 vs priPac2 BLASTZ_H=2000 BLASTZ_M=50 # TARGET: elegans Ce9 SEQ1_DIR=/scratch/data/ce9/ce9.2bit SEQ1_LEN=/scratch/data/ce9/chrom.sizes SEQ1_CHUNK=1000000 SEQ1_LAP=10000 # QUERY: C. briggsae priPac2 SEQ2_DIR=/scratch/data/priPac2/priPac2.2bit SEQ2_LEN=/scratch/data/priPac2/chrom.sizes SEQ2_CHUNK=1000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/ce9/bed/blastzPriPac2.2010-09-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -verbose=2 -bigClusterHub=swarm \ -qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 & # real 7m33.978s # trouble during chainRun due to missing files on memk kluster # finish that manually, then continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=chainMerge \ -workhorse=hgwdev -verbose=2 -bigClusterHub=swarm \ -qRepeats=windowmaskerSdust -smallClusterHub=swarm \ > chainMerge.log 2>&1 & # real 3m14.461s cat fb.ce9.chainPriPac2Link.txt # 5878567 bases of 100286004 (5.862%) in intersection # swap, this is also in priPac2.txt mkdir /hive/data/genomes/priPac2/bed/blastz.ce9.swap cd /hive/data/genomes/priPac2/bed/blastz.ce9.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ce9/bed/blastzPriPac2.2010-09-29/DEF \ -qRepeats=windowmaskerSdust -bigClusterHub=swarm \ -smallClusterHub=swarm -swap > swap.log 2>&1 & # real 1m6.706s cat fb.priPac2.chainCe9Link.txt # 6269492 bases of 133634773 (4.692%) in intersection ############################################################################ #### Blat sangerGene proteins to determine exons # (DONE - 2010-10-07 - hiram) ssh hgwdev cd /hive/data/genomes/ce9/bed mkdir blat.ce9SG.2010-10-07 ln -s blat.ce9SG.2010-10-07 blat.ce9SG cd blat.ce9SG pepPredToFa ce9 sangerPep sangerPep.fa # create nib directory from ce9.2bit mkdir nib cd nib for C in I II III IV V X M do twoBitToFa -seq=chr${C} /hive/data/genomes/ce9/ce9.2bit stdout \ | faToNib -softMask stdin chr${C}.nib done # The kluster run ssh swarm cd /hive/data/genomes/ce9/bed/blat.ce9SG cat << '_EOF_' > blatSome #!/bin/csh -fe blat -t=dnax -q=prot -out=pslx nib/$1.nib sgfa/$2.fa $3 '_EOF_' # << happy emacs chmod +x blatSome ls -1S /san/sanvol1/scratch/worms/ce9/nib > ce9.lst mkdir sgfa cd sgfa faSplit sequence ../sangerPep.fa 3000 sg ls -1S *.fa > ../sg.lst cd .. cat << '_EOF_' > template #LOOP blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl} #ENDLOOP '_EOF_' # << happy emacs gensub2 ce9.lst sg.lst template jobList mkdir psl cd psl sed -e "s/.nib//" ../ce9.lst | xargs mkdir cd .. para create jobList para try ... check ... push ... etc # Completed: 20314 of 20314 jobs # CPU time in finished jobs: 76075s 1267.92m 21.13h 0.88d 0.002 y # IO & Wait Time: 57172s 952.86m 15.88h 0.66d 0.002 y # Average job time: 7s 0.11m 0.00h 0.00d # Longest finished job: 199s 3.32m 0.06h 0.00d # Submission to last job: 509s 8.48m 0.14h 0.01d cd /hive/data/genomes/ce9/bed/blat.ce9SG.2010-10-07 pslSort dirs raw.psl /tmp psl/* # Got 20314 files 143 files per mid file # -rw-rw-r-- 1 80091706 Oct 7 13:53 raw.psl pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null # Processed 172014 alignments # -rw-rw-r-- 1 31268144 Oct 7 13:55 cooked.psl pslUniq cooked.psl ce9SG.psl # -rw-rw-r-- 1 30179810 Oct 7 13:58 ce9SG.psl cut -f 10 ce9SG.psl | sort -u > sgName.lst faSomeRecords sangerPep.fa sgName.lst ce9SG.fa # -rw-rw-r-- 1 13013108 Oct 7 12:22 sangerPep.fa # -rw-rw-r-- 1 13008952 Oct 7 14:00 ce9SG.fa grep "^>" ce9SG.fa | wc -l # 28103 grep "^>" sangerPep.fa | wc -l # 28152 pslxToFa ce9SG.psl ce9SG_ex.fa -liftTarget=genome.lft \ -liftQuery=protein.lft # -rw-rw-r-- 1 8073643 Oct 7 14:01 genome.lft # -rw-rw-r-- 1 6441476 Oct 7 14:01 protein.lft # -rw-rw-r-- 1 15264194 Oct 7 14:01 ce9SG_ex.fa wc -l *.psl *.lft *.fa sgName.lst # 28103 ce9SG.psl # 30368 cooked.psl # 172019 raw.psl # 178860 genome.lft # 178860 protein.lft # 290895 ce9SG.fa # 357720 ce9SG_ex.fa # 291039 sangerPep.fa # 28103 sgName.lst # 1555967 total # back on hgwdev ssh hgwdev cd /hive/data/genomes/ce9/bed/blat.ce9SG hgsql -N -e "select * from sangerGene;" ce9 | cut -f1-10 > sangerGene.gp # Need a reference table loaded, perl script makes up relationships cat << '_EOF_' >> mkRef.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); sub usage() { print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n"; print STDERR "reads ../sangerGene/genePred.tab and ", "../sangerGene/fixedOrfToGene.tab\n"; print STDERR "writes references to the given input file, ", "e.g. blastSGRef01.tab\n"; } if (1 != $argc) { usage; exit 255; } my $outFile = shift; my %orfToGene; # key is all lowercase orf name, value is gene name my $inFile="../sangerGene/fixedOrfToGene.tab"; open (FH,"<$inFile") or die "can not read $inFile"; while (my $line=) { chomp $line; my ($orf, $gene) = split('\s+',$line); $orfToGene{lc($orf)} = $gene } close (FH); my %links; # key is all lowercase orf name, value is SwissProt name $inFile="../sangerGene/sangerLinks.tab"; open (FH,"<$inFile") or die "can not read $inFile"; while (my $line=) { chomp $line; my ($orf, $SP, $comments) = split('\s+',$line); $links{lc($orf)} = $SP } close (FH); my %ce9Position; # key is all lowercase orf name, value is ce9 position $inFile="sangerGene.gp"; open (FH,"<$inFile") or die "can not read $inFile"; while (my $line=) { chomp $line; my ($orf, $chr, $strand, $start, $end, $other) = split('\s+',$line); $ce9Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end); } close (FH); open (OF,">$outFile") or die "can not write to $outFile"; $inFile="sgName.lst"; open (FH,"<$inFile") or die "can not read $inFile"; while (my $orf=) { chomp $orf; if (!exists($orfToGene{lc($orf)})) { $orfToGene{lc($orf)} = ""; } if (!exists($links{lc($orf)})) { $links{lc($orf)} = ""; } if (!exists($ce9Position{lc($orf)})) { $ce9Position{lc($orf)} = ""; } printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)}, $ce9Position{lc($orf)}, $links{lc($orf)}, ""; } close (FH); close (OF); '_EOF_' # << happy emacs chmod +x ./mkRef.pl ./mkRef.pl blastSGRef01.tab wc -l blastSGRef01.tab # 28103 blastSGRef01.tab hgLoadSqlTab ce9 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \ blastSGRef01.tab wc -l sgName.lst blastSGRef01.tab ce9SG.psl # 28103 sgName.lst # 28103 blastSGRef01.tab # 28103 ce9SG.psl # 84309 total # And load the protein sequences hgPepPred ce9 generic blastSGPep01 ce9SG.fa ############################################################################ # multiz10Way MAF FRAMES (DONE - 2010-10-12 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce9/bed/multiz10way/frames cd /hive/data/genomes/ce9/bed/multiz10way/frames mkdir genes # The following is adapted from the ce6 6-way sequence #------------------------------------------------------------------------ # get the genes for all genomes # using sangerGene for ce9 # using blastCe6SG for cb3, caePb2, caeRem3, caeJap1 priPac1 hgsql -N -e "select * from sangerGene" ce9 | cut -f 1-10 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/ce9.gp.gz for qDB in `sed -e "s/ce9 //; s/melInc1 //" ../species.list` do geneTbl=ws210Gene echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/$qDB.gp.gz done qDB=melInc1 geneTbl=blastCe9SG echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB} hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \ > /scratch/tmp/$qDB.tmp.psl mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/$qDB.tmp.gz rm -f /scratch/tmp/$qDB.tmp.psl mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz cd /hive/data/genomes/ce9/bed/multiz10way/frames cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce9 \ cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames ce9 \ stdin stdout \ `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \ | gzip > multiz10way.mafFrames.gz # the sed trick there with ../species.list produces the following argument # list: # ce9 genes/ce9.gp.gz cb3 genes/cb3.gp.gz caeRem3 genes/caeRem3.gp.gz \ # caePb2 genes/caePb2.gp.gz caeJap3 genes/caeJap3.gp.gz \ # haeCon1 genes/haeCon1.gp.gz priPac2 genes/priPac2.gp.gz \ # melHap1 genes/melHap1.gp.gz melInc1 genes/melInc1.gp.gz \ # bruMal1 genes/bruMal1.gp.gz ssh hgwdev cd /hive/data/genomes/ce9/bed/multiz10way/frames nice -n +19 hgLoadMafFrames ce9 multiz10wayFrames multiz10way.mafFrames.gz ####################################################################### # verify all.joiner has everything (DONE - 2010-10-18 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=ce9 ./all.joiner joinerCheck -keys -database=ce9 ./all.joiner joinerCheck -times -database=ce9 ./all.joiner joinerCheck -all -database=ce9 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ####################################################################### ## Creating downloads (DONE - 2010-10-18 - Hiram) # There is only one chrom, make its trfMaskChrom file exist ssh hgwdev cd /hive/data/genomes/ce9 nice -n +19 /cluster/bin/scripts/makeDownloads.pl ce9 > downloads.log 2>&1 ## *!* EDIT THE README.txt FILES *!* ## ########################################################################## # hgPal downloads (DONE - Hiram - 2010-10-18) ssh hgwdev screen bash mkdir /hive/data/genomes/ce9/bed/multiz10way/pal cd /hive/data/genomes/ce9/bed/multiz10way/pal mz=multiz10way gp=refGene db=ce9 echo $db > order.lst hgsql $db -e "select settings from trackDb where tableName='$mz'" \ | sed 's/\\n/\n/g' | grep sGroup_ | sed -e "s#sGroup_[a-zA-Z]* ##" \ | tr ' ' '\n' >> order.lst mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | cut -f1` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > refGene.jobs time sh -x refGene.jobs > refGene.job.log 2>&1 & sleep 1 tail -f refGene.job.log # real 11m36.350s zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz10way gp=sangerGene db=ce9 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 19m29.826s zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment pd=/hive/data/genomes/ce9/goldenPath/multiz10way/alignments mkdir $pd ln -s `pwd`/*.exonAA.fa.gz $pd/ ln -s `pwd`/*.exonNuc.fa.gz $pd/ ######################################################################### # verify all.joiner has everything (DONE - 2010-10-18 - Hiram) # edit all.joiner until all these commands are successful cd $HOME/kent/src/hg/makeDb/schema joinerCheck -tableCoverage -database=ce9 ./all.joiner joinerCheck -keys -database=ce9 ./all.joiner joinerCheck -times -database=ce9 ./all.joiner joinerCheck -all -database=ce9 ./all.joiner # the -all command will complain about other databases on hgwdev # that are not specified in all.joiner. There are a lot of test # databases on hgwdev ####################################################################### ## Creating pushQ (DONE - 2010-10-18 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ce9/pushQ cd /hive/data/genomes/ce9/pushQ makePushQSql.pl ce9 > ce9.sql 2> errorLog.out ## check the errorLog.out for anything that needs to be fixed ## copy ce9.sql to hgwbeta:/tmp ## then on hgwbeta: hgsql qapushq < ce9.sql ########################################################################## # set this as the defaultDb (DONE - 2010-10-19 - Hiram) hgsql -e 'update defaultDb set name="ce9" where name="ce6";' \ hgcentraltest ########################################################################## # LIFTOVER TO ce10 (DONE - 2011-05-24 - Hiram ) mkdir /hive/data/genomes/ce9/bed/blat.ce10.2011-05-24 cd /hive/data/genomes/ce9/bed/blat.ce10.2011-05-24 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce9/jkStuff/ce9.11.ooc -debug ce9 ce10 # Real run: time nice -n +19 doSameSpeciesLiftOver.pl \ -buildDir=`pwd` \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ce9/jkStuff/ce9.11.ooc ce9 ce10 > do.log 2>&1 # real 4m22.712s # verify it works on genome-test #############################################################################