# for emacs: -*- mode: sh; -*- # This file describes browser build for the oreNil2 # Oreochromis niloticus - Nile Tilapia # DATE: 31-Jan-2011 # ORGANISM: Oreochromis niloticus # TAXID: 8128 # ASSEMBLY LONG NAME: Orenil1.1 # ASSEMBLY SHORT NAME: Orenil1.1 # ASSEMBLY SUBMITTER: Broad Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000188235.2 # FTP-RELEASE DATE: 09-Feb-2012 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus/Orenil1.1/ # Genome ID: # http://www.ncbi.nlm.nih.gov/genome/197 # http://www.ncbi.nlm.nih.gov/bioproject/59571 # http://www.ncbi.nlm.nih.gov/nuccore/320434504 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AERX01 # 269x # Taxonomy: # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8128 # Mitochondrial sequence: # http://www.ncbi.nlm.nih.gov/bioproject/42843 - chrMt NC_013663.1 # Assembly ID: 354508 # http://www.ncbi.nlm.nih.gov/genome/assembly/354508/ ############################################################################# # fetch sequence from genbank (DONE - 2012-04-10 - Hiram) mkdir -p /hive/data/genomes/genomes/oreNil2/genbank cd /hive/data/genomes/genomes/oreNil2/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus/Orenil1.1/ ./ # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 927679487 bases (111611440 N's 816068047 real 816068047 upper 0 lower) # in 5677 sequences in 23 files # Total size: mean 163410.2 sd 1922362.4 # min 1000 (gi|320342652|gb|AERX01077754.1|) # max 51042256 (gi|374976115|gb|CM001450.1|) median 2009 ############################################################################# # fixup names for UCSC standards (DONE - 2012-04-10 - Hiram) mkdir /hive/data/genomes/oreNil2/ucsc cd /hive/data/genomes/oreNil2/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl # real 0m33.257s gzip *.fa *.agp faSize chr*.fa.gz # 657350972 bases (50870875 N's 606480097 real 606480097 upper 0 lower) # in 22 sequences in 22 files # Total size: mean 29879589.6 sd 7407420.0 min 17092887 (chrLG10) # max 51042256 (chrLG7) median 31194787 ######################## Unplaced scaffolds # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^GL|^AERX" \ | sed -e 's/^GL[0-9][0-9]*//; s/^AERX[0-9][0-9]*//' | sort | uniq -c # 71382 .1 # 1737 .2 # in that case, since they are not all .1, we need to use the full name # modified to remove the . # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./-/; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\|.*//; $line =~ s/\./-/; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 1m5.894s # make sure none of the names got to be over 31 characers long: zcat unplaced.agp | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 4077 14 # 69042 10 faSize unplaced.fa.gz # 270328515 bases (60740565 N's 209587950 real 209587950 upper 0 lower) # in 5655 sequences in 1 files # Total size: mean 47803.5 sd 238106.9 min 1000 (AERX01077754-1) # max 7888616 (GL831144-1) median 1992 # verify we have the same sequence as delivered from genbank: faSize *.fa.gz # 927679487 bases (111611440 N's 816068047 real 816068047 upper 0 lower) # in 5677 sequences in 23 files # Total size: mean 163410.2 sd 1922362.4 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # from genbank: # 927679487 bases (111611440 N's 816068047 real 816068047 upper 0 lower) # in 5677 sequences in 23 files # Total size: mean 163410.2 sd 1922362.4 # min 1000 (gi|320342652|gb|AERX01077754.1|) # max 51042256 (gi|374976115|gb|CM001450.1|) median 2009 ############################################################################# # Initial database build (DONE - 2012-04-10 - Hiram) cd /hive/data/genomes/oreNil2 cat << '_EOF_' > oreNil2.config.ra # Config parameters for makeGenomeDb.pl: db oreNil2 clade vertebrate genomeCladePriority 85 scientificName Oreochromis niloticus commonName Nile tilapia assemblyDate Jan. 2011 assemblyLabel Broad Institute of MIT and Harvard Orenil1.1 (GCA_000188235.1) assemblyShortLabel Nile tilapia ncbiAssemblyName Orenil1.1 ncbiAssemblyId 354508 orderKey 4679 mitoAcc NC_013663 fastaFiles /cluster/data/oreNil2/ucsc/*.fa.gz agpFiles /cluster/data/oreNil2/ucsc/*.agp.gz dbDbSpeciesDir oreNil taxId 8128 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp oreNil2.config.ra > agp.log 2>&1 # real 1m6.870s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) time makeGenomeDb.pl -continue=db oreNil2.config.ra > db.log 2>&1 # real 8m45.076s ########################################################################## # running repeat masker (DONE - 2012-04-10 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/repeatMasker cd /hive/data/genomes/oreNil2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek oreNil2 > do.log 2>&1 & # real 67m27.295s cat faSize.rmsk.txt # 927696114 bases (111611440 N's 816084674 real 786292709 upper # 29791965 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %3.21 masked total, %3.65 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ featureBits -countGaps oreNil2 rmsk # 30137751 bases of 927696114 (3.249%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-04-10 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/simpleRepeat cd /hive/data/genomes/oreNil2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ oreNil2 > do.log 2>&1 & # real 8m53.572s cat fb.simpleRepeat # 7960749 bases of 866932349 (0.918%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/oreNil2 twoBitMask oreNil2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed oreNil2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa oreNil2.2bit stdout | faSize stdin > faSize.oreNil2.2bit.txt cat faSize.oreNil2.2bit.txt # 1511735326 bases (153400444 N's 1358334882 real 1024824487 upper # 333510395 lower) in 19550 sequences in 1 files # %22.06 masked total, %24.55 masked real rm /gbdb/oreNil2/oreNil2.2bit ln -s `pwd`/oreNil2.2bit /gbdb/oreNil2/oreNil2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/gap cd /hive/data/genomes/oreNil2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../oreNil2.unmasked.2bit > findMotif.txt 2>&1 # real 3m21.738s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits oreNil2 -not gap -bed=notGap.bed # 866932349 bases of 866932349 (100.000%) in intersection time featureBits oreNil2 allGaps.bed notGap.bed -bed=new.gaps.bed # 50847675 bases of 866932349 (5.865%) in intersection # real 2m16.940s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" oreNil2 | sort -n | tail -1 # 1004 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" oreNil2 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps oreNil2 other.bed # 50847675 bases of 927696114 (5.481%) in intersection wc -l other.bed # 38113 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad oreNil2 otherGap other.bed # starting with this many hgsql -e "select count(*) from gap;" oreNil2 # 33964 hgsql oreNil2 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" oreNil2 # 72077 # == 33964 + 38113 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 oreNil2 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" oreNil2 | sort | uniq -c # 232 no # 71845 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-04-10 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/windowMasker cd /hive/data/genomes/oreNil2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev oreNil2 > do.log 2>&1 & # real 66m1.067s # Masking statistics twoBitToFa oreNil2.wmsk.2bit stdout | faSize stdin # 927696114 bases (111611440 N's 816084674 real 612807423 upper # 203277251 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %21.91 masked total, %24.91 masked real twoBitToFa oreNil2.wmsk.sdust.2bit stdout | faSize stdin # 927696114 bases (111611440 N's 816084674 real 608110449 upper # 207974225 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.42 masked total, %25.48 masked real hgLoadBed oreNil2 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 5157020 elements of size 3 featureBits -countGaps oreNil2 windowmaskerSdust # 319585665 bases of 927696114 (34.449%) in intersection # eliminate the gaps from the masking featureBits oreNil2 -not gap -bed=notGap.bed # 816084674 bases of 816084674 (100.000%) in intersection time nice -n +19 featureBits oreNil2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 207974225 bases of 816084674 (25.484%) in intersection # real 1m44.114s # reload track to get it clean hgLoadBed oreNil2 windowmaskerSdust cleanWMask.bed.gz # Loaded 5181510 elements of size 4 featureBits -countGaps oreNil2 windowmaskerSdust # 207974225 bases of 927696114 (22.418%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../oreNil2.unmasked.2bit stdin \ -type=.bed oreNil2.cleanWMSdust.2bit twoBitToFa oreNil2.cleanWMSdust.2bit stdout | faSize stdin \ > oreNil2.cleanWMSdust.faSize.txt cat oreNil2.cleanWMSdust.faSize.txt # 927696114 bases (111611440 N's 816084674 real 608110449 upper # 207974225 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.42 masked total, %25.48 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps oreNil2 rmsk windowmaskerSdust # 21445936 bases of 927696114 (2.312%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram) # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence cd /hive/data/genomes/oreNil2 twoBitMask -add bed/windowMasker/oreNil2.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed oreNil2.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa oreNil2.2bit stdout | faSize stdin > faSize.oreNil2.txt cat faSize.oreNil2.txt # 927696114 bases (111611440 N's 816084674 real 607935500 upper # 208149174 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.44 masked total, %25.51 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/oreNil2/oreNil2.2bit ln -s `pwd`/oreNil2.2bit /gbdb/oreNil2/oreNil2.2bit ######################################################################## # cpgIslands - (DONE - 2011-04-20 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/cpgIslands cd /hive/data/genomes/oreNil2/bed/cpgIslands time doCpgIslands.pl oreNil2 > do.log 2>&1 # Elapsed time: 11m10s time featureBits oreNil2 cpgIslandExt > fb.oreNil2.cpgIslandExt.txt 2>&1 # 6076152 bases of 816084674 (0.745%) in intersection # real 0m2.342s ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/oreNil2/bed/genscan cd /hive/data/genomes/oreNil2/bed/genscan time doGenscan.pl oreNil2 > do.log 2>&1 # real 26m8.894s # three broken jobs rerunning at window size of 2000000: ./runGsBig.csh chrLG12 000 gtf/000/chrLG12.gtf pep/000/chrLG12.pep subopt/000/chrLG12.bed & ./runGsBig.csh chrLG20 000 gtf/000/chrLG20.gtf pep/000/chrLG20.pep subopt/000/chrLG20.bed & ./runGsBig.csh chrLG22 000 gtf/000/chrLG22.gtf pep/000/chrLG22.pep subopt/000/chrLG22.bed # real 13m41.480s # continuing: time doGenscan.pl -continue=makeBed oreNil2 > makeBed.log 2>&1 # Elapsed time: 9m20s cat fb.oreNil2.genscan.txt # 38275835 bases of 816084674 (4.690%) in intersection cat fb.oreNil2.genscanSubopt.txt # 28219603 bases of 816084674 (3.458%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc $ 816084674 / 2897316137 $ \* 1024 # ( 816084674 / 2897316137 ) * 1024 = 288.429245 # round up to 300 cd /hive/data/genomes/oreNil2 time blat oreNil2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/oreNil2.11.ooc -repMatch=300 # Wrote 16614 overused 11-mers to jkStuff/oreNil2.11.ooc # real 0m28.015s # there are non-bridged gaps, construct lift file needed for genbank hgsql -N -e "select bridge from gap;" oreNil2 | sort | uniq -c # 232 no # 71845 yes cd /hive/data/genomes/oreNil2/jkStuff gapToLift oreNil2 oreNil2.nonBridged.lift -bedFile=oreNil2.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' oreNil2.nonBridged.bed | sort -nr | head # 13623339 chrLG1 17571448 31194787 chrLG1.07 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Oreochromis niloticus 351 120993 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add oreNil2 just after ce2 # oreNil2 (Nile tilapia) oreNil2.serverGenome = /hive/data/genomes/oreNil2/oreNil2.2bit oreNil2.clusterGenome = /hive/data/genomes/oreNil2/oreNil2.2bit oreNil2.ooc = /hive/data/genomes/oreNil2/jkStuff/oreNil2.11.ooc oreNil2.lift = /hive/data/genomes/oreNil2/jkStuff/oreNil2.nonBridged.lift oreNil2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} oreNil2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} oreNil2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} oreNil2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} oreNil2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} oreNil2.refseq.mrna.native.load = yes oreNil2.refseq.mrna.xeno.load = yes oreNil2.genbank.mrna.xeno.load = yes oreNil2.genbank.est.native.load = yes oreNil2.downloadDir = oreNil2 oreNil2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding oreNil2 Nile tilapia" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for oreNilNames" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S oreNil2 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial oreNil2 & # var/build/logs/2012.06.08-09:52:19.oreNil2.initalign.log # real 1252m57.323s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oreNil2 & # logFile: var/dbload/hgwdev/logs/2012.06.11-10:16:13.dbload.log # real 30m6.085s # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add oreNil2 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added oreNil2." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chrLG5:26068730-26095423" where name="oreNil2";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-24 - Hiram) mkdir /hive/data/genomes/oreNil2/pushQ cd /hive/data/genomes/oreNil2/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl oreNil2 2> stderr.txt | grep -v transMap > oreNil2.sql # real 3m31.947s scp -p oreNil2.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/oreNil2.sql" ############################################################################