# for emacs: -*- mode: sh; -*- # This file describes browser build for the latCha1 # Coelacanth - Latimeria chalumnae - Aug 2011 # photoCreditURL http://en.wikipedia.org/wiki/File:Coelacanth.png # photoCreditName Robbie Cada # ncbiGenomeId 3262 # ncbiAssemblyId 303548 # ncbiAssemblyName Broad LatCha1 # ncbiBioProject 56111 # genBankAccessionID GCA_000225785.1 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AFYH00 # 77.5X coverage WGS # http://www.ncbi.nlm.nih.gov/bioproject/56111 ############################################################################# # Fetch sequence from genbank (DONE - 2012-01-06 - Hiram) mkdir -p /hive/data/genomes/latCha1/genbank cd /hive/data/genomes/latCha1/genbank time wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Latimeria_chalumnae/LatCha1/*" # Downloaded: 13 files, 807M in 11m 36s (1.16 MB/s) # real 11m41.968s # measure sequence to be used here faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2860575514 bases (676999153 N's 2183576361 real 2183576361 upper 0 # lower) in 22818 sequences in 1 files # Total size: mean 125364.9 sd 395789.0 # min 1000 (gi|345908326|gb|AFYH01291818.1|) # max 10736886 (gi|346342275|gb|JH126562.1|) median 5475 ############################################################################# # process into UCSC naming scheme (DONE - 2012-01-11 - Hiram) mkdir /hive/data/genomes/latCha1/ucsc cd /hive/data/genomes/latCha1/ucsc # verify we don't have any .acc numbers different from .1 zcat ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^JH|AFYH" \ | sed -e 's/^JH[0-9][0-9]*//; s/^AFYH[0-9][0-9]*//' | sort | uniq -c # 560838 .1 # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 1m4.707s time gzip *.fa *.agp # real 11m4.518s # -rw-rw-r-- 1 27747518 Jan 11 10:57 unplaced.agp # -rw-rw-r-- 1 2901727727 Jan 11 10:58 unplaced.fa # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa.gz # 2860575514 bases (676999153 N's 2183576361 real 2183576361 upper 0 lower) in # 22818 sequences in 1 files # Total size: mean 125364.9 sd 395789.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 ############################################################################# # Initial browser build (DONE - 2012-01-11 - Hiram) cd /hive/data/genomes/latCha1 cat << '_EOF_' > latCha1.config.ra # Config parameters for makeGenomeDb.pl: db latCha1 clade vertebrate genomeCladePriority 82 scientificName Latimeria chalumnae commonName Coelacanth assemblyDate Aug. 2011 assemblyLabel Broad LatCha1 (GCA_000225785.1) assemblyShortLabel LatCha1 orderKey 447 mitoAcc NC_001804 fastaFiles /hive/data/genomes/latCha1/ucsc/*.fa.gz agpFiles /hive/data/genomes/latCha1/ucsc/*.agp.gz dbDbSpeciesDir latCha taxId 7897 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp latCha1.config.ra > agp.log 2>&1 # real 5m52.756s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db latCha1.config.ra > db.log 2>&1 # real 16m19.040s ############################################################################# # running repeat masker (DONE - 2012-01-11 - Hiram) mkdir /hive/data/genomes/latCha1/bed/repeatMasker cd /hive/data/genomes/latCha1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk latCha1 > do.log 2>&1 & # real 250m27.510s cat faSize.rmsk.txt # 2860591921 bases (676999153 N's 2183592768 real 1981827417 upper # 201765351 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %7.05 masked total, %9.24 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps latCha1 rmsk # 201917583 bases of 2860591921 (7.059%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-11 - Hiram) mkdir /hive/data/genomes/latCha1/bed/simpleRepeat cd /hive/data/genomes/latCha1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ latCha1 > do.log 2>&1 & # real 43m42.360s cat fb.simpleRepeat # 33730886 bases of 2183592768 (1.545%) in intersection # not going to add to rmsk here, using the window masker instead since # it masks more sequence cd /hive/data/genomes/latCha1 twoBitMask latCha1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed latCha1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa latCha1.2bit stdout | faSize stdin > faSize.latCha1.2bit.txt cat faSize.latCha1.2bit.txt # 2608572064 bases (131440969 N's 2477131095 real 1320629270 upper # 1156501825 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) # max 72162052 (JH378105) median 1553 rm /gbdb/latCha1/latCha1.2bit ln -s `pwd`/latCha1.2bit /gbdb/latCha1/latCha1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-11 - Hiram) mkdir /hive/data/genomes/latCha1/bed/gap cd /hive/data/genomes/latCha1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../latCha1.unmasked.2bit > findMotif.txt 2>&1 # real 0m27.822s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps latCha1 -not gap -bed=notGap.bed # 2183592768 bases of 2860591921 (76.334%) in intersection # real 0m22.615s time featureBits -countGaps latCha1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2860591921 (0.000%) in intersection # real 28m7.467s # no unannotated gaps, no more to run here # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" latCha1 | sort | uniq -c # 269010 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-01-11 - Hiram) mkdir /hive/data/genomes/latCha1/bed/windowMasker cd /hive/data/genomes/latCha1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev latCha1 > do.log 2>&1 & # real 275m14.802s # Masking statistics twoBitToFa latCha1.wmsk.2bit stdout | faSize stdin # 2860591921 bases (676999153 N's 2183592768 real 1197030078 upper # 986562690 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %34.49 masked total, %45.18 masked real twoBitToFa latCha1.wmsk.sdust.2bit stdout | faSize stdin # 2860591921 bases (676999153 N's 2183592768 real 1186196719 upper # 997396049 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %34.87 masked total, %45.68 masked real hgLoadBed latCha1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 14478353 elements of size 3 featureBits -countGaps latCha1 windowmaskerSdust # 1674395202 bases of 2860591921 (58.533%) in intersection # eliminate the gaps from the masking featureBits latCha1 -not gap -bed=notGap.bed # 2183592768 bases of 2183592768 (100.000%) in intersection time nice -n +19 featureBits latCha1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 997396049 bases of 2183592768 (45.677%) in intersection # real 16m48.600s # reload track to get it clean hgLoadBed latCha1 windowmaskerSdust cleanWMask.bed.gz # Loaded 14630157 elements of size 4 time featureBits -countGaps latCha1 windowmaskerSdust # 997396049 bases of 2860591921 (34.867%) in intersection # real 2m47.260s # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../latCha1.unmasked.2bit stdin \ -type=.bed latCha1.cleanWMSdust.2bit twoBitToFa latCha1.cleanWMSdust.2bit stdout | faSize stdin \ > latCha1.cleanWMSdust.faSize.txt cat latCha1.cleanWMSdust.faSize.txt # 2860591921 bases (676999153 N's 2183592768 real 1186196719 upper # 997396049 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %34.87 masked total, %45.68 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps latCha1 rmsk windowmaskerSdust # 177085590 bases of 2860591921 (6.191%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-12 - Hiram) cd /hive/data/genomes/latCha1 twoBitMask -add bed/windowMasker/latCha1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed latCha1.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa latCha1.2bit stdout | faSize stdin > faSize.latCha1.txt cat faSize.latCha1.txt # 2860591921 bases (676999153 N's 2183592768 real 1186074582 upper # 997518186 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %34.87 masked total, %45.68 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/latCha1/latCha1.2bit ln -s `pwd`/latCha1.2bit /gbdb/latCha1/latCha1.2bit ######################################################################### # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/latCha1/bed/cpgIslands cd /hive/data/genomes/latCha1/bed/cpgIslands time doCpgIslands.pl latCha1 > do.log 2>&1 # Elapsed time: 75m3s cat fb.latCha1.cpgIslandExt.txt # 3128859 bases of 2183592768 (0.143%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/latCha1/bed/genscan cd /hive/data/genomes/latCha1/bed/genscan time doGenscan.pl latCha1 > do.log 2>&1 # Elapsed time: 107m4s cat fb.latCha1.genscan.txt # 34776149 bases of 2183592768 (1.593%) in intersection cat fb.latCha1.genscanSubopt.txt # 39383851 bases of 2183592768 (1.804%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2183592768 / 2897316137 \) \* 1024 # ( 2183592768 / 2897316137 ) * 1024 = 771.748366 # round up to 800 cd /hive/data/genomes/latCha1 time blat latCha1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/latCha1.11.ooc -repMatch=800 # Wrote 28363 overused 11-mers to jkStuff/latCha1.11.ooc # real 0m39.722s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" latCha1 | sort | uniq -c # 269010 yes # cd /hive/data/genomes/latCha1/jkStuff # gapToLift latCha1 latCha1.nonBridged.lift -bedFile=latCha1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' latCha1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Latimeria chalumnae 47 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf # latCha1 (Coelacanth) latCha1.serverGenome = /hive/data/genomes/latCha1/latCha1.2bit latCha1.clusterGenome = /hive/data/genomes/latCha1/latCha1.2bit latCha1.ooc = /hive/data/genomes/latCha1/jkStuff/latCha1.11.ooc latCha1.lift = no latCha1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} latCha1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} latCha1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} latCha1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} latCha1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} latCha1.refseq.mrna.native.load = no latCha1.refseq.mrna.xeno.load = yes latCha1.genbank.mrna.xeno.load = yes latCha1.genbank.est.native.load = no latCha1.downloadDir = latCha1 latCha1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding latCha1 Coelacanth" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for latChaNames" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S latCha1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial latCha1 & # var/build/logs/2012.05.08-11:55:55.latCha1.initalign.log # real 2310m21.398s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad latCha1 & # logFile: var/dbload/hgwdev/logs/2012.05.09-16:06:10.dbload.log # real 85m52.881s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add latCha1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added latCha1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # photograph (DONE - 2012-06-2012 - Hiram) # note at http://en.wikipedia.org/wiki/File:Coelacanth.png # This work has been released into the public domain by # its author, Robbie Cada. This applies worldwide. mkdir /hive/data/genomes/latCha1/photograph wget --timestamping \ http://upload.wikimedia.org/wikipedia/commons/4/4b/Coelacanth.png convert -geometry "400x200" Coelacanth.png "Latimeria_chalumnae.png" # add Latimeria_chalumnae.png to the source tree src/hg/htdocs/images/ ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH126819:1630199-1640227" where name="latCha1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-23 - Hiram) mkdir /hive/data/genomes/latCha1/pushQ cd /hive/data/genomes/latCha1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl latCha1 2> stderr.txt | grep -v transMap > latCha1.sql # real 4m0.069s scp -p latCha1.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < latCha1.sql ############################################################################