# for emacs: -*- mode: sh; -*- # Caenorhabditis japonica # Washington University School of Medicine GSC and Sanger Institute # WUSTL version 4.0.1 Jan 2009 # $Id: caeJap2.txt,v 1.3 2009/09/21 20:26:39 hiram Exp $ ######################################################################## # Download sequence (DONE - 2009-07-22 - Hiram) mkdir -p /hive/data/genomes/caeJap2/wustl cd /hive/data/genomes/caeJap2/wustl wget --timestamping \ ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/ASSEMBLY wget --timestamping \ ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/README wget --timestamping \ ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/output/*.* cat << '_EOF_' > superToAgp.pl #!/usr/bin/env perl use strict; use warnings; my %superSizes; open (FH, ") { next if ($line =~ m/^#/); next if ($line =~ m/^total/); my ($name, $size, $rest) = split('\s+',$line,3); $superSizes{$name} = $size; } close (FH); my %contigSizes; open (FH, ") { next if ($line =~ m/^#/); next if ($line =~ m/^total/); my ($name, $size, $rest) = split('\s+',$line,3); $contigSizes{$name} = $size; } close (FH); my $superName = ""; my $contigPart = ""; my $chromStart = 1; my $chromEnd = 1; my $id = 1; my $a = ""; my $b = ""; my $size = 0; my $skipSuper = 0; open (CB, ">Contig.bed") or die "can not write to Contig.bed"; open (FH, "zcat supercontigs.gz|") or die "can not read supercontigs.gz"; while (my $line = ) { next if ($line =~ m/^\s*$/); chomp $line; if ($line =~ m/^supercontig /) { ($a, $b) = split('\s+', $line); $superName = $b; $superName =~ s/Superc/C/; if (!exists($superSizes{$superName})) { $skipSuper = 1; next; } else { $skipSuper = 0; } my $superEnd = $chromStart + $superSizes{$superName} - 1; if ($chromEnd > 1) { printf CB "chrUn\t%d\t%d\t%s\n", $chromStart-1+1000, $superEnd+1000, $superName; } else { printf CB "chrUn\t%d\t%d\t%s\n", $chromStart-1, $superEnd, $superName; } if ($chromEnd > 1) { $size = 1000; $chromEnd = $chromStart + $size - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tcontig\tno\n", $chromStart, $chromEnd, $id++, $size; $chromStart = $chromEnd + 1; } next; } elsif ($line =~ m/^contig /) { next if ($skipSuper > 0); ($a, $b) = split('\s+', $line); $contigPart = $b; die "can not find size for $contigPart" if (!exists($contigSizes{$contigPart})); $size = $contigSizes{$contigPart}; $chromEnd = $chromStart + $size - 1; printf "chrUn\t%d\t%d\t%d\tW\t%s\t1\t%d\t+\n", $chromStart, $chromEnd, $id++, $contigPart, $size; $chromStart = $chromEnd + 1; next; } elsif ($line =~ m/^gap /) { next if ($skipSuper > 0); my ($g, $gapSize, $gapDev, $star, $x) = split('\s+', $line); $size = $gapSize; $size = 10 if ($size < 0); die "gap size is 0" if ($size == 0); $chromEnd = $chromStart + $size - 1; printf "chrUn\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $chromStart, $chromEnd, $id++, $size; $chromStart = $chromEnd + 1; } else { die "do not recognize line: $line"; } } close (FH); close (CB); '_EOF_' # << happy emacs chmod +x superToAgp.pl ./superToAgp.pl > chrUn.agp qaToQac contigs.fa.qual.gz contigs.qac qacAgpLift chrUn.agp contigs.qac chrUn.qual.qac # and create a ctgPos2 file to show ContigN locations awk '{ size = $3-$2 printf "%s\t%d\tchrUn\t%d\t%d\tW\n", $4, size, $2, $3 }' Contig.bed > ctgPos2.tab ######################################################################## # initial genome browser build (DONE - 2009-07-23 - Hiram) cd /hive/data/genomes/caeJap2 cat << '_EOF_' > caeJap2.config.ra # Config parameters for makeGenomeDb.pl: db caeJap2 clade worm genomeCladePriority 10 scientificName Caenorhabditis japonica commonName C. japonica assemblyDate Jan. 2009 assemblyLabel Washington University School of Medicine GSC C. japonica 4.0.1 orderKey 881 mitoAcc none fastaFiles /hive/data/genomes/caeJap2/wustl/contigs.fa.gz agpFiles /hive/data/genomes/caeJap2/wustl/chrUn.agp qualFiles /hive/data/genomes/caeJap2/wustl/chrUn.qual.qac dbDbSpeciesDir worm taxId 281687 '_EOF_' # << happy emacs # verify sequence and AGP specs are OK makeGenomeDb.pl -stop=agp caeJap2.config.ra makeGenomeDb.pl -continue=db caeJap2.config.ra > makeGenomeDb.db.log 2>&1 ln -s `pwd`/caeJap2.unmasked.2bit /gbdb/caeJap2/caeJap2.2bit # your personal browser should be functioning now ######################################################################## # Repeat Masker (DONE - 2009-07-23 - Hiram) # repeat masker was run, but it did not mask much, so that table # was removed so that windowmaskerSdust could be the masking track # when fetching DNA from the browser mkdir /hive/data/genomes/caeJap2/bed/repeatMasker cd /hive/data/genomes/caeJap2/bed/repeatMasker doRepeatMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1 cat faSize.rmsk.txt # 170655152 bases (41359841 N's 129295311 real 126749989 upper 2545322 lower) # in 1 sequences in 1 files # %1.49 masked total, %1.97 masked real ssh hgwdev hgsql -e "drop table chrUn_rmsk;" caeJap2 # this leaves the interrupted repeats track showing on genome-test ######################################################################## # Simple Repeats (DONE - 2009-07-23 - Hiram) mkdir /hive/data/genomes/caeJap2/bed/simpleRepeat cd /hive/data/genomes/caeJap2/bed/simpleRepeat doSimpleRepeat.pl -buildDir=`pwd` caeJap2 > do.log 2>&1 ######################################################################## # Window Masker (DONE - 2009-07-23 - Hiram) mkdir /hive/data/genomes/caeJap2/bed/windowMasker cd /hive/data/genomes/caeJap2/bed/windowMasker doWindowMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1 # load this initial data to get ready to clean it ssh hgwdev cd /hive/data/genomes/caeJap2/bed/windowMasker hgLoadBed caeJap2 windowmaskerSdust windowmasker.sdust.bed.gz featureBits -countGaps caeJap2 windowmaskerSdust # 98788858 bases of 170655152 (57.888%) in intersection # eliminate the gaps from the masking featureBits caeJap2 -not gap -bed=notGap.bed time nice -n +19 featureBits caeJap2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 57429460 bases of 129295754 (44.417%) in intersection # reload track to get it clean hgLoadBed caeJap2 windowmaskerSdust cleanWMask.bed.gz # Loaded 874463 elements of size 4 featureBits -countGaps caeJap2 windowmaskerSdust # 57429460 bases of 170655152 (33.652%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../caeJap2.unmasked.2bit stdin \ -type=.bed caeJap2.cleanWMSdust.2bit twoBitToFa caeJap2.cleanWMSdust.2bit stdout | faSize stdin \ > caeJap2.cleanWMSdust.faSize.txt cat caeJap2.cleanWMSdust.faSize.txt # 170655152 bases (41359841 N's 129295311 real 71865902 upper # 57429409 lower) in 1 sequences in 1 files # %33.65 masked total, %44.42 masked real ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2009-07-23 - Hiram) cd /hive/data/genomes/caeJap2 twoBitMask -add bed/windowMasker/caeJap2.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed caeJap2.2bit twoBitToFa caeJap2.2bit stdout | faSize stdin \ > faSize.caeJap2.wmskSdust.TRF.txt cat faSize.caeJap2.wmskSdust.TRF.txt # 170655152 bases (41359841 N's 129295311 real 71749466 upper # 57545845 lower) in 1 sequences in 1 files # %33.72 masked total, %44.51 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/caeJap2/caeJap2.2bit ln -s `pwd`/caeJap2.2bit /gbdb/caeJap2/caeJap2.2bit ######################################################################## # add ctgPos2 track (DONE - 2008-04-03 - Hiram) ssh hgwdev cd /hive/data/genomes/caeJap2/wustl hgLoadSqlTab caeJap2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2009-07-23 - Hiram) # Use -repMatch=100 (based on size -- for human we use 1024, and # worm size is ~5.1% of human judging by gapless caeJap2 vs. hg18 genome # size from featureBits. So we would use 50, but that yields a very # high number of tiles to ignore, especially for a small more compact # genome. Bump that up a bit to be more conservative. cd /hive/data/genomes/caeJap2 blat caeJap2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/caeJap2.11.ooc -repMatch=100 # Wrote 14991 overused 11-mers to jkStuff/caeJap2.11.ooc cd /hive/data/genomes/caeJap2/jkStuff # note the full size of chrUn tail ../chrom.sizes # chrUn 170655152 # use that full size in the following awk: awk '{ printf "%d\t%s\t%d\tchrUn\t170655152\n", $2, $4, $3-$2 }' ../wustl/Contig.bed > caeJap2.chrUn.lift ######################################################################### ## Make maskedContigs (DONE - 2009-07-23 - Hiram) mkdir /hive/data/genomes/caeJap2/maskedContigs cd /hive/data/genomes/caeJap2/maskedContigs ln -s ../jkStuff/caeJap2.chrUn.lift ./supers.lift cp -p /hive/data/genomes/caeJap1/maskedContigs/lft2BitToFa.pl . time nice -n +19 ./lft2BitToFa.pl ../caeJap2.2bit supers.lift > supers.fa faToTwoBit supers.fa caeJap2.supers.2bit # real 4m18.841s # verify size is not broken from this procedure, should be same number # of real, upper and lower. Only the N count and total count changes twoBitToFa caeJap2.supers.2bit stdout | faSize stdin # 163120152 bases (33824841 N's 129295311 real 71749466 upper # 57545845 lower) in 7536 sequences in 1 files # %35.28 masked total, %44.51 masked real twoBitToFa ../caeJap2.2bit stdout | faSize stdin # 170655152 bases (41359841 N's 129295311 real 71749466 upper # 57545845 lower) in 1 sequences in 1 files # %33.72 masked total, %44.51 masked real twoBitInfo caeJap2.supers.2bit stdout \ | sort -k2rn > caeJap2.supers.sizes # copy all of this stuff to the klusters: cd /hive/data/genomes/caeJap2 mkdir /hive/data/staging/data/caeJap2 cp -p jkStuff/caeJap2.11.ooc jkStuff/caeJap2.chrUn.lift \ maskedContigs/caeJap2.supers.2bit maskedContigs/caeJap2.supers.sizes \ chrom.sizes caeJap2.2bit /hive/data/staging/data/caeJap2 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2009-07-23 - Hiram) # align with latest genbank process. ssh hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # edit etc/genbank.conf to add caeJap2 just before caeJap1 # caeJap2 (C. remanei) caeJap2.serverGenome = /hive/data/genomes/caeJap2/caeJap2.2bit caeJap2.clusterGenome = /scratch/data/caeJap2/caeJap2.2bit caeJap2.ooc = /scratch/data/caeJap2/caeJap2.11.ooc caeJap2.lift = /scratch/data/caeJap2/caeJap2.chrUn.lift caeJap2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} caeJap2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} caeJap2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} caeJap2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} caeJap2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} caeJap2.refseq.mrna.native.load = yes caeJap2.refseq.mrna.xeno.load = yes caeJap2.refseq.mrna.xeno.loadDesc = yes caeJap2.genbank.mrna.xeno.load = yes caeJap2.genbank.est.native.load = yes caeJap2.genbank.est.native.loadDesc = no caeJap2.downloadDir = caeJap2 caeJap2.perChromTables = no cvs ci -m "Added caeJap2" etc/genbank.conf # update /cluster/data/genbank/: make etc-update ssh genbank screen # use a screen to manage this job cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial caeJap2 & # logFile: var/build/logs/2009.07.23-14:13:01.caeJap2.initalign.log # real 137m55.074s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeJap2 # logFile: var/dbload/hgwdev/logs/2009.08.10-16:42:51.dbload.log # real 16m14.090s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add caeJap2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added caeJap2 - C. japonica" etc/align.dbs etc/hgwdev.dbs make etc-update # done - 2009-09-21 - Hiram #########################################################################