# for emacs: -*- mode: sh; -*- # Gorilla gorilla ######################################################################### # DOWNLOAD SEQUENCE (DONE braney 2008-10-07) mkdir /hive/data/genomes/gorGor1 cd /hive/data/genomes/gorGor1 ln -s `pwd` /cluster/data/gorGor1 mkdir /hive/data/genomes/gorGor1/sanger cd /hive/data/genomes/gorGor1/sanger wget --timestamping \ ftp://ftp.sanger.ac.uk/pub4/gorilla/gorilla_draft_genome_31265.agp.gz wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/sequences/gorilla/draft_assembly_v0.1/gorilla_draft_genome_31265.fastq.gz wget --timestamping \ ftp://ftp.sanger.ac.uk/pub/sequences/gorilla/draft_assembly_v0.1/gorilla_draft_genome_31265.fragments.gz XXX - working 2008-10-21 # there are problems with all of these files qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin gorGor1.qual.qac cut -f 1 assembly.agp | uniq -c | wc -l # Number of scaffolds: 116467 ######################################################################### # Create .ra file and run makeGenomeDb.pl(DONE braney 2008-10-07) ssh kolossus cd /cluster/data/gorGor1 cat << '_EOF_' >gorGor1.config.ra # Config parameters for makeGenomeDb.pl: db gorGor1 clade mammal genomeCladePriority 12 scientificName Gorilla gorilla gorilla commonName Gorilla assemblyDate Oct. 2008 assemblyLabel Sanger Institute genome_31265 orderKey 27 mitoAcc NC_011120 # fastaFiles # /hive/data/genomes/gorGor1/sanger/gorilla_draft_genome_31265.fragments.gz fastaFiles /hive/data/genomes/gorGor1/sanger/clean.fragments.fa.gz # fastaFiles /hive/data/genomes/gorGor1/sanger/genome_31265.fa # agpFiles /hive/data/genomes/gorGor1/sanger/gorilla_draft_genome_31265.agp.gz agpFiles /hive/data/genomes/gorGor1/sanger/clean.agp # qualFiles none dbDbSpeciesDir gorilla '_EOF_' # << happy emacs # there is some funny business going on with the agp file, it doesn't # specify gaps that are at the beginning of the scaffold. Our tools # will not work with such an AGP file, so fix it up: cd /hive/data/genomes/gorGor1/sanger cat << '_EOF_' > fixAgp.pl #!/usr/bin/env perl use strict; use warnings; open (FH,"zcat gorilla_draft_genome_31265.agp.gz|") or die "can not zcat gorilla_draft_genome_31265.agp.gz"; while (my $line = ) { my ($name, $start, $end, $frag, $type, $rest) = split('\s+', $line, 6); if ((1 == $frag) && (1 != $start)) { printf "%s\t1\t%d\t1\tN\t%d\tfragment\tyes\n", $name, $start-1, $start-1; } print "$line"; } close (FH); '_EOF_' # << happy emacs chmod +x fixAgp.pl ./fixAgp.pl > clean.agp # running these step-wise to work out difficulties at each step makeGenomeDb.pl -stop=seq -workhorse=hgwdev -verbose=2 gorGor1.config.ra makeGenomeDb.pl -continue=agp -stop=agp -workhorse=hgwdev \ -verbose=2 gorGor1.config.ra makeGenomeDb.pl -continue=db -stop=db -workhorse=hgwdev \ -verbose=2 gorGor1.config.ra # the names are too long for our normal chromInfo index: LOAD DATA INFILE '/hive/data/genomes/gorGor1/bed/chromInfo/chromInfo.tab' INTO TABLE chromInfo mySQL error 1062: Duplicate entry 'Supercontig_0000' for key 1 # Fixup the chromInfo.sql file: cd /hive/data/genomes/gorGor1/bed/chromInfo cp /cluster/home/hiram/kent/src/hg/lib/chromInfo.sql . # PRIMARY KEY(chrom(20)) - was 16 before hgLoadSqlTab gorGor1 chromInfo *.sql *.tab # continuing with the rest of the steps from jkStuff/makeDb.csh, # starting with: hgGoldGapGl -noGl gorGor1 /cluster/data/gorGor1/gorGor1.agp makeGenomeDb.pl -continue=dbDb -stop=dbDb -workhorse=hgwdev \ -verbose=2 gorGor1.config.ra makeGenomeDb.pl -continue=trackDb -stop=trackDb -workhorse=hgwdev \ -verbose=2 gorGor1.config.ra cut -f 2 chrom.sizes | ave stdin # Q1 855.000000 # median 1216.000000 # Q3 2773.000000 # average 3810.123774 # min 500.000000 # max 942632.000000 # count 609861 # total 2323645895.000000 # standard deviation 9559.595923 # looking for the N50 value: awk '{sum += $2; print NR, sum, $0}' chrom.sizes | less # scanning down until the sum is 1/2 of the total # 2323645895 / 2 = 1161822947 # where it looks like: # 45571 1161796841 Supercontig_0163790 11279 # 45572 1161808120 Supercontig_0164808 11279 # 45573 1161819399 Supercontig_0249037 11279 # 45574 1161830677 Supercontig_0021829 11278 # 45575 1161841955 Supercontig_0049999 11278 # thus N50 is size 11,278 bases at #45,574 ######################################################################### ## Repeat Masker (DONE - 2008-11-03,04 - Hiram) screen # to manage this several day job mkdir /hive/data/genomes/gorGor1/bed/repeatMasker cd /hive/data/genomes/gorGor1/bed/repeatMasker time doRepeatMasker.pl -verbose=2 \ -workhorse=hgwdev -bigClusterHub=swarm \ -buildDir=`pwd` gorGor1 > do.log 2>&1 & # real 360m45.970s # real 797m7.301s cat faSize.rmsk.txt # 2323645895 bases (279671235 N's 2043974660 real 1079526671 upper 964447989 # lower) in 609861 sequences in 1 files # %41.51 masked total, %47.18 masked real # from the do.log file, to go into the bigZips/README file: # RepeatMasker version development-$Id: RepeatMasker,v 1.22 2008/08/08 # 16:44:40 angie Exp $ # Aug 7 2008 (open-3-2-6) version of RepeatMasker # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC RELEASE 20080801; ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-11-03 - Hiram) screen # use a screen to manage this job mkdir /hive/data/genomes/gorGor1/bed/simpleRepeat cd /hive/data/genomes/gorGor1/bed/simpleRepeat # time doSimpleRepeat.pl -verbose=2 \ -buildDir=`pwd` gorGor1 > do.log 2>&1 & # real 1471m34.533s # there were delays due to hang-ups when trying to talk to kolossus cat fb.simpleRepeat # 187515375 bases of 2075548667 (9.034%) in intersection # after RM run is done, add this mask: cd /hive/data/genomes/gorGor1 rm gorGor1.2bit twoBitMask gorGor1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed gorGor1.2bit # can safely ignore warning about >=13 fields in bed file twoBitToFa gorGor1.2bit stdout | faSize stdin > gorGor1.2bit.faSize.txt # 2323645895 bases (279671235 N's 2043974660 real 1077776023 upper 966198637 # lower) in 609861 sequences in 1 files # %41.58 masked total, %47.27 masked real # link to gbdb ln -s `pwd`/gorGor1.2bit /gbdb/gorGor1 ########################################################################### # prepare for kluster runs (DONE _ 2008-11-04 - Hiram) # compare to size of real bases to adjust the repMatch # hg18: 2881421696 # gorGor1: 2043974660 # thus: 1024 * 2043974660/2881421696 = 726 # rounding up to 800 for a more conservative masking cd /hive/data/genomes/gorGor1 time blat gorGor1.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=gorGor1.11.ooc -repMatch=800 # Wrote 23284 overused 11-mers to gorGor1.11.ooc # real 2m3.365s # and staging data for push to kluster nodes mkdir /hive/data/staging/data/gorGor1 cp -p gorGor1.2bit chrom.sizes gorGor1.11.ooc \ /hive/data/staging/data/gorGor1 # request to cluster admin to push this to the kluster nodes # /scratch/data/ ########################################################################### # add NCBI identifiers to the dbDb (DONE - 2008-11-03 - Hiram) hgsql -e 'update dbDb set sourceName="Sanger Institute Oct 2008 (NCBI project 31265, CABD01000000)" where name="gorGor1";' hgcentraltest ########################################################################### # BLATSERVERS ENTRY (DONE - 2007-11-04 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor1", "blat13", "17790", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor1", "blat13", "17791", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # Create quality track (DONE - 2008-11-10 - Hiram) mkdir /hive/data/genomes/gorGor1/sanger/fastq cd /hive/data/genomes/gorGor1/sanger/fastq # the delivered fastq file is full of sequences that have nothing to # do with the Supercontigs, scan it and select out only that business # that belongs, and fixup the chrM sequence name (although this chrM is # different than the one from NC_011120) zcat ../gorilla_draft_genome_31265.fastq.gz | awk ' BEGIN { done = 0 } { if (match($0,"^@3_42962700_339")) { done = 1; } if (match($0,"^@gi.5835149.ref.NC_001645.1")) { print "@chrM gi.5835149.ref.NC_001645.1 Supercontig_" } else { if (0 == done) { print } } } ' | gzip -c > trimmed.fastq.gz # after re-writing fastqToFa to get it to properly parse multi-line # fastq files: fastqToFa -qual=gorGor1.qual.fa -nameVerify="Supercontig_" -noErrors \ -verbose=2 -qualSizes=gorGor1.qual.sizes \ trimmed.fastq.gz stdout | gzip -c > gorGor1.withAllNs.fa.gz # And then, a lot of these records are all N's - so, eliminate them: faCount gorGor1.fa.gz > gorGor1.faCount.txt egrep "^Super|^chr" gorGor1.withAllNs.faCount.txt \ | awk '{if($2 == $7) { print }}' \ > all.Ns.names egrep "^Super|^chr" gorGor1.withAllNs.faCount.txt \ | awk '{if($2 != $7) { print }}' \ > not.all.Ns.names cut -f 1 not.all.Ns.names \ | faSomeRecords gorGor1.withAllNs.fa.gz stdin stdout | gzip -c \ > gorGor1.fa.gz cat << '_EOF_' > cleanQuals.pl #!/usr/bin/env perl use strict; use warnings; my %allowList; my $allowListSize = 0; open (FH, "cut -f 1 not.all.Ns.names|") or die "can not cut -f 1 not.all.Ns.names"; while (my $line = ) { chomp $line; $allowList{$line} = 1; ++$allowListSize; } close (FH); print STDERR "working with a list of $allowListSize names\n"; my $allowed = 0; my $allowedCount = 0; open (CL, ">clean.gorGor1.qual.fa") or die "can not write to clean.gorGor1.qual.fa"; open (FH, ") { if ($line =~ m/^>Supercontig_|^>chrM /) { my $name = $line; chomp $name; $name =~ s/^>//; $name =~ s/ .*//; if (exists($allowList{$name})) { $allowed = 1; ++$allowedCount; } else { $allowed = 0; } } if ($allowed) { print CL "$line"; } } close (FH); print STDERR "passed through $allowedCount sequences\n"; '_EOF_' # << happy emacs ./cleanQuals.pl # working with a list of 609861 names # passed through 609861 sequences cd /cluster/data/gorGor1/bed/qual ln -s ../../sanger/fastq/clean.gorGor1.qual.fa gorGor1.qual.fa qaToQac gorGor1.qual.fa gorGor1.qual.qac qacToWig -fixed gorGor1.qual.qac stdout | wigEncode stdin qual.{wig,wib} # Converted stdin, upper limit 93.00, lower limit 0.00 # real 20m44.535s # loading table: rm -f /gbdb/gorGor1/wib/qual.wib ln -s /cluster/data/gorGor1/bed/qual/qual.wib /gbdb/gorGor1/wib/ time hgLoadWiggle -tmpDir=/scratch/tmp \ -pathPrefix=/gbdb/gorGor1/wib gorGor1 quality qual.wig # real 1m8.385s ############################################################################