# for emacs: -*- mode: sh; -*-

# This file describes browser build for the dasNov3
#       Dasypus novemcinctus - armadillo

#       DATE:   01-Dec-2011
#       ORGANISM:       Dasypus novemcinctus
#       TAXID:  9361
#       ASSEMBLY LONG NAME:     Dasnov3.0
#       ASSEMBLY SHORT NAME:    Dasnov3.0
#       ASSEMBLY SUBMITTER:     Baylor College of Medicine
#       ASSEMBLY TYPE:  Haploid
#       NUMBER OF ASSEMBLY-UNITS:       1
#       ASSEMBLY ACCESSION:     GCA_000208655.2
#       FTP-RELEASE DATE: 07-Jan-2012

#       rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Dasypus_novemcinctus/Dasnov3.0/ ./

#       Genome ID:
#       http://www.ncbi.nlm.nih.gov/genome/235
#       http://www.ncbi.nlm.nih.gov/bioproject/12594
#       http://www.ncbi.nlm.nih.gov/nuccore/AAGV00000000.3
#       http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAGV03
#       Genome Coverage : 6x

#       Taxonomy:
#       http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9361

#       Mitochondrial sequence:
#       http://www.ncbi.nlm.nih.gov/bioproject/11837 - chrMt NC_001821.1

#       Assembly ID: 326198
#       http://www.ncbi.nlm.nih.gov/genome/assembly/326198/

#############################################################################
# fetch sequence from genbank (DONE - 2012-04-11 - Hiram)
    # the later steps refer to this directory as ../genbank relative to
    # /hive/data/genomes/dasNov3/ucsc
    mkdir -p /hive/data/genomes/dasNov3/genbank
    cd /hive/data/genomes/dasNov3/genbank

    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Dasypus_novemcinctus/Dasnov3.0/ ./

    # measure sequence to be used here
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    # 3631505655 bases (331640683 N's 3299864972 real 3299864972 upper
    #   0 lower) in 46558 sequences in 1 files
    # Total size: mean 77999.6 sd 423977.1
    #   min 200 (gi|367146187|gb|AAGV03313855.1|)
    #   max 14349515 (gi|371469823|gb|JH573670.1|) median 3708

#############################################################################
# fixup names for UCSC standards (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/dasNov3/ucsc
    cd /hive/data/genomes/dasNov3/ucsc

    ########################  Unplaced scaffolds
    # verify we don't have any .acc numbers different from .1
    zcat \
    ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
        | cut -f1 | egrep "^JH|^AAGV" \
        | sed -e 's/^JH[0-9][0-9]*//; s/^AAGV[0-9][0-9]*//' | sort | uniq -c
    #   583384 .1

    # in that case, since they are all .1, we can remove them

    # this is like the unplaced.pl script in other assemblies except it
    #   does not add chrUn_ to the names since they are all just scaffolds

    # The here-document word is quoted so the shell does not expand the perl
    # variables; the terminating line is the bare word _EOF_.
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl

# Rewrite the genbank unplaced-scaffold AGP and FASTA files with UCSC style
# sequence names, writing unplaced.agp.gz and unplaced.fa.gz in the current
# directory.

use strict;
use warnings;

my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# AGP: comment lines are copied unchanged; on every other line only the first
# ".1" is removed, all later text on the line is left as it is.
open (my $agpIn, '-|', 'zcat', $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, '|-', 'gzip -c > unplaced.agp.gz')
    or die "can not write to unplaced.agp.gz: $!";
while (defined(my $line = readline($agpIn))) {
    $line =~ s/\.1// unless ($line =~ m/^#/);
    print $agpOut $line;
}
# closing a pipe handle reports a failed zcat/gzip, e.g. truncated input
close ($agpIn) or die "zcat $agpFile failed: status $?";
close ($agpOut) or die "gzip to unplaced.agp.gz failed: status $?";

# FASTA: reduce a header such as ">gi|371469823|gb|JH573670.1|" to ">JH573670";
# sequence lines are copied unchanged.
open (my $faIn, '-|', 'zcat', $fastaFile) or die "can not read $fastaFile: $!";
open (my $faOut, '|-', 'gzip -c > unplaced.fa.gz')
    or die "can not write to unplaced.fa.gz: $!";
while (defined(my $line = readline($faIn))) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;    # drop everything through the "gb|" tag
        $line =~ s/\|.*//;      # drop the trailing "|" and anything after it
        $line =~ s/\.1//;       # drop the accession version
        # plain print: the name is data, never a printf format string
        print $faOut ">$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat $fastaFile failed: status $?";
close ($faOut) or die "gzip to unplaced.fa.gz failed: status $?";
_EOF_
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    # real    16m41.592s

    faSize unplaced.fa.gz
    # 3631505655 bases (331640683 N's 3299864972 real 3299864972 upper
    #   0 lower) in 46558 sequences in 1 files
    # Total size: mean 77999.6 sd 423977.1 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708

    # make sure none of the names got to be over 31 characters long:
    zcat unplaced.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \
        | sort -rn | uniq -c | head
    #    23501 12
    #   559883 8

#############################################################################
#  Initial database build (DONE - 2012-04-11 - Hiram)

    cd /hive/data/genomes/dasNov3
    # fastaFiles and agpFiles are the renamed files that unplaced.pl wrote
    # in the ucsc directory
    cat << '_EOF_' > dasNov3.config.ra
# Config parameters for makeGenomeDb.pl:
db dasNov3
clade mammal
genomeCladePriority 56
scientificName Dasypus novemcinctus
commonName Armadillo
assemblyDate Dec. 2011
assemblyLabel Baylor College of Medicine Dasnov3.0 (GCA_000208655.2)
assemblyShortLabel Armadillo
ncbiAssemblyName Dasnov3.0
ncbiAssemblyId 326198
orderKey 3459
mitoAcc NC_001821
fastaFiles /hive/data/genomes/dasNov3/ucsc/unplaced.fa.gz
agpFiles /hive/data/genomes/dasNov3/ucsc/unplaced.agp.gz
dbDbSpeciesDir armadillo
taxId 9361
_EOF_
    # << happy emacs

    # verify sequence and agp are OK
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp dasNov3.config.ra > agp.log 2>&1
    # real    4m40.661s
    # verify OK:
    tail -1 agp.log
    #   *** All done!  (through the 'agp' step)

    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev dasNov3.config.ra > db.log 2>&1
    # real    30m16.480s

##########################################################################
# running repeat masker (DONE - 2012-04-11 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/repeatMasker
    cd /hive/data/genomes/dasNov3/bed/repeatMasker
    time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=encodek dasNov3 > do.log 2>&1 &
    # real    1214m38.968s

    cat faSize.rmsk.txt
    # 3631522711 bases (331640683 N's 3299882028 real 1674428220 upper
    #   1625453808 lower) in 46559 sequences in 1 files
    # Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    # %44.76 masked total, %49.26 masked real

    egrep -i "versi|relea" do.log
    # April 26 2011 (open-3-3-0) version of RepeatMasker
    # CC   RELEASE 20110920;
    # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps dasNov3 rmsk
    # 1635908112 bases of 3631522711 (45.047%) in intersection

# why is it different than the faSize above ?
# because rmsk masks out some N's as well as bases, the count above
# separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-04-11 - Hiram)

    mkdir /hive/data/genomes/dasNov3/bed/simpleRepeat
    cd /hive/data/genomes/dasNov3/bed/simpleRepeat
    # runs in the background, progress is logged to do.log
    time doSimpleRepeat.pl -buildDir=`pwd` \
        -bigClusterHub=swarm -smallClusterHub=encodek \
        -dbHost=hgwdev -workhorse=hgwdev dasNov3 > do.log 2>&1 &
    # real    18m3.412s

    cat fb.simpleRepeat
    # 45167829 bases of 3299882059 (1.369%) in intersection

    # add to rmsk after it is done:
    cd /hive/data/genomes/dasNov3
    twoBitMask dasNov3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dasNov3.2bit
    # you can safely ignore the warning about fields >= 13

    # measure the combined rmsk + TRF masking
    twoBitToFa dasNov3.2bit stdout | faSize stdin > faSize.dasNov3.2bit.txt
    cat faSize.dasNov3.2bit.txt
    # 3631522711 bases (331640683 N's 3299882028 real 1673564758 upper
    #   1626317270 lower) in 46559 sequences in 1 files
    # Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    # %44.78 masked total, %49.28 masked real

    # replace the /gbdb symlink so it points at the masked sequence
    rm /gbdb/dasNov3/dasNov3.2bit
    ln -s `pwd`/dasNov3.2bit /gbdb/dasNov3/dasNov3.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
#       (DONE - 2012-04-11 - Hiram)

    mkdir /hive/data/genomes/dasNov3/bed/gap
    cd /hive/data/genomes/dasNov3/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 -strand=+ \
        ../../dasNov3.unmasked.2bit > findMotif.txt 2>&1
    # real    1m10.266s

    # keep only the "#GAP " lines of the findMotif output, minus that prefix
    sed -n -e 's/^#GAP //p' findMotif.txt > allGaps.bed

    time featureBits dasNov3 -not gap -bed=notGap.bed
    # 3299882059 bases of 3299882059 (100.000%) in intersection
    # real    2m31.368s

    # N's found by findMotif that the gap table does not already cover
    time featureBits dasNov3 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 31 bases of 3299882059 (0.000%) in intersection
    # real    67m12.512s
    # 1 base gaps, one 2 base gap, not worth the bother
# count the gap table rows by their bridge value
    hgsql -N -e "select bridge from gap;" dasNov3 | sort | uniq -c
    # 268413 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-04-10 - Hiram)

    mkdir /hive/data/genomes/dasNov3/bed/windowMasker
    cd /hive/data/genomes/dasNov3/bed/windowMasker
    # runs in the background, progress is logged to do.log
    time nice -n +19 doWindowMasker.pl -buildDir=`pwd` \
        -workhorse=hgwdev -dbHost=hgwdev dasNov3 > do.log 2>&1 &
    # real    280m54.607s

    # Masking statistics
    twoBitToFa dasNov3.wmsk.2bit stdout | faSize stdin
    # 3631522711 bases (331640683 N's 3299882028 real 1932049508 upper
    #   1367832520 lower) in 46559 sequences in 1 files
    # Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    # %37.67 masked total, %41.45 masked real

    twoBitToFa dasNov3.wmsk.sdust.2bit stdout | faSize stdin
    # 3631522711 bases (331640683 N's 3299882028 real 1910015235 upper
    #   1389866793 lower) in 46559 sequences in 1 files
    # Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    # %38.27 masked total, %42.12 masked real

    hgLoadBed dasNov3 windowmaskerSdust windowmasker.sdust.bed.gz
    # Read 19264039 elements of size 3 from windowmasker.sdust.bed.gz

    featureBits -countGaps dasNov3 windowmaskerSdust
    # 1721507449 bases of 3631522711 (47.405%) in intersection

    # eliminate the gaps from the masking
    featureBits dasNov3 -not gap -bed=notGap.bed
    # 3299882059 bases of 3299882059 (100.000%) in intersection
    time nice -n +19 featureBits dasNov3 windowmaskerSdust notGap.bed -bed=stdout \
        | gzip -c > cleanWMask.bed.gz
    # 1389866797 bases of 3299882059 (42.119%) in intersection
    # real    36m17.629s

    # reload track to get it clean
    hgLoadBed dasNov3 windowmaskerSdust cleanWMask.bed.gz
    # Read 19267435 elements of size 4 from cleanWMask.bed.gz

    featureBits -countGaps dasNov3 windowmaskerSdust
    # 1389866797 bases of 3631522711 (38.272%) in intersection

    # apply the cleaned mask to the unmasked sequence
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../dasNov3.unmasked.2bit stdin -type=.bed \
            dasNov3.cleanWMSdust.2bit
twoBitToFa dasNov3.cleanWMSdust.2bit stdout | faSize stdin \
        > dasNov3.cleanWMSdust.faSize.txt
    cat dasNov3.cleanWMSdust.faSize.txt
    # 3631522711 bases (331640683 N's 3299882028 real 1910015235 upper
    #   1389866793 lower) in 46559 sequences in 1 files
    # Total size: mean 77998.3 sd 423972.7 min 200 (AAGV03313855)
    #   max 14349515 (JH573670) median 3708
    # %38.27 masked total, %42.12 masked real

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps dasNov3 rmsk windowmaskerSdust
    # 1019390239 bases of 3631522711 (28.071%) in intersection

#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-11 - Hiram)
    # Not masking with WM here since RM masked well
    # Every command in this section is commented out.  What follows is the
    # procedure for the other case, kept for reference only:
    #   since rmsk masks so very little of the genome, use the clean WM mask
    #   on the genome sequence
    # The faSize numbers quoted below (5678 sequences, chrLG7) do not match
    # the dasNov3 measurements elsewhere in this file (46559 sequences); they
    # come from a different assembly.
#    cd /hive/data/genomes/dasNov3
#    twoBitMask -add bed/windowMasker/dasNov3.cleanWMSdust.2bit \
#       bed/simpleRepeat/trfMask.bed dasNov3.2bit
    #   safe to ignore the warnings about BED file with >=13 fields
#    twoBitToFa dasNov3.2bit stdout | faSize stdin > faSize.dasNov3.txt
#    cat faSize.dasNov3.txt
    # 927696114 bases (111611440 N's 816084674 real 607935500 upper
    #   208149174 lower) in 5678 sequences in 1 files
    # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1)
    #   max 51042256 (chrLG7) median 2009
    # %22.44 masked total, %25.51 masked real

    #   create symlink to gbdb
#    ssh hgwdev
#    rm /gbdb/dasNov3/dasNov3.2bit
#    ln -s `pwd`/dasNov3.2bit /gbdb/dasNov3/dasNov3.2bit

########################################################################
# cpgIslands - (DONE - 2012-04-24 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/cpgIslands
    cd /hive/data/genomes/dasNov3/bed/cpgIslands
    time doCpgIslands.pl dasNov3 > do.log 2>&1
    # real    98m44.307s

    cat fb.dasNov3.cpgIslandExt.txt
    # 59319215 bases of 3299882059 (1.798%) in intersection

#########################################################################
# genscan - (DONE - 2012-04-26 - Hiram)
    mkdir /hive/data/genomes/dasNov3/bed/genscan
    cd /hive/data/genomes/dasNov3/bed/genscan
    time doGenscan.pl dasNov3 > do.log 2>&1
    # Elapsed time: 161m31s

    cat fb.dasNov3.genscan.txt
    # 71941727 bases of 3299882059 (2.180%) in intersection
    cat fb.dasNov3.genscanSubopt.txt
    # 85113391 bases of 3299882059 (2.579%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram)
    # Use -repMatch=1200, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    calc \( 3299882028 / 2897316137 \) \* 1024
    # ( 3299882028 / 2897316137 ) * 1024 = 1166.279079

    # round up to 1200
    cd /hive/data/genomes/dasNov3
    time blat dasNov3.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=jkStuff/dasNov3.11.ooc -repMatch=1200
    # Wrote 34644 overused 11-mers to jkStuff/dasNov3.11.ooc
    # real    2m17.323s

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" dasNov3 | sort | uniq -c
    # 268413 yes
    # The lift file commands below are commented out, since every gap is
    # bridged.  The chrX sample output does not correspond to any dasNov3
    # sequence name seen in this file (all are JH* / AAGV* scaffolds).
#    cd /hive/data/genomes/dasNov3/jkStuff
#    gapToLift dasNov3 dasNov3.nonBridged.lift -bedFile=dasNov3.nonBridged.bed
    # largest non-bridged contig:
#    awk '{print $3-$2,$0}' dasNov3.nonBridged.bed | sort -nr | head
    # 123773608 chrX 95534 123869142 chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram)
    # examine the file: /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
# organism              mrnaCnt estCnt  refSeqCnt
# Dasypus novemcinctus  23      0       0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add dasNov3 just after ce2
    # (the lines from here to "end of section" are the text added to
    #  etc/genbank.conf, they are not shell commands)
# dasNov3 (armadillo)
dasNov3.serverGenome = /hive/data/genomes/dasNov3/dasNov3.2bit
dasNov3.clusterGenome = /hive/data/genomes/dasNov3/dasNov3.2bit
dasNov3.ooc = /hive/data/genomes/dasNov3/jkStuff/dasNov3.11.ooc
dasNov3.lift = no
dasNov3.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
dasNov3.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
dasNov3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
dasNov3.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
dasNov3.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
dasNov3.refseq.mrna.native.load = no
dasNov3.refseq.mrna.xeno.load = yes
dasNov3.genbank.mrna.xeno.load = yes
dasNov3.genbank.est.native.load = no
dasNov3.downloadDir = dasNov3
dasNov3.perChromTables = no

    # end of section added to etc/genbank.conf
    git commit -m "adding dasNov3 armadillo" etc/genbank.conf
    git push
    make etc-update

    ssh hgwdev                  # used to do this on "genbank" machine
    screen -S dasNov3           # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial dasNov3 &
    # var/build/logs/2012.05.03-16:02:26.dasNov3.initalign.log
    # real    1567m32.727s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad dasNov3 &
    # logFile: var/dbload/hgwdev/logs/2012.05.08-14:17:49.dbload.log
    # real    39m52.260s

    # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
    # NOTE(review): 2012-02-09 is earlier than the 2012-05-03 initial
    # alignment recorded above, the month is probably mistyped -- confirm
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add dasNov3 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added dasNov3." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################
# set default position to OPN5 gene displays  (DONE - 2012-07-20 - Hiram)
    hgsql -e \
'update dbDb set defaultPos="JH564832:1503619-1555676" where name="dasNov3";' \
        hgcentraltest

############################################################################
# pushQ entry (DONE - 2012-07-20 - Hiram)
    mkdir /hive/data/genomes/dasNov3/pushQ
    cd /hive/data/genomes/dasNov3/pushQ
    # Mark says don't let the transMap track get there
    time makePushQSql.pl dasNov3 2> stderr.txt | grep -v transMap > dasNov3.sql

    scp -p dasNov3.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < dasNov3.sql

############################################################################