# for emacs: -*- mode: sh; -*- user 0m0.063s sys 0m0.066s # DATE: 03-Oct-2011 # ORGANISM: Tursiops truncatus # TAXID: 9739 # ASSEMBLY LONG NAME: Ttru_1.4 # ASSEMBLY SHORT NAME: Ttru_1.4 # ASSEMBLY SUBMITTER: Baylor College of Medicine # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000151865.2 # FTP-RELEASE DATE: 03-Jan-2012 # http://www.ncbi.nlm.nih.gov/genome/769 # http://www.ncbi.nlm.nih.gov/genome/assembly/325108/ # http://www.ncbi.nlm.nih.gov/bioproject/20365 # http://www.ncbi.nlm.nih.gov/bioproject/34125 - chrMt NC_012059.1 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABRN02 # Genome Coverage : 2.5x Sanger; 3.5x 454; 30x Illumina # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9739 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Tursiops_truncatus/Ttru_1.4/ ########################################################################## # Download sequence (DONE - 2012-04-11 - Hiram) mkdir /hive/data/genomes/turTru2 cd /hive/data/genomes/turTru2 mkdir genbank cd genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Tursiops_truncatus/Ttru_1.4/ ./ # real 14m52.432s # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2551980185 bases (219594130 N's 2332386055 real 2332386055 upper # 0 lower) in 240900 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 # min 200 (gi|366711530|gb|ABRN02115282.1|) # max 1003560 (gi|366982705|gb|JH472452.1|) median 706 ########################################################################## # fixup names for UCSC standards (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/turTru2/ucsc cd /hive/data/genomes/turTru2/ucsc ######################## Unplaced scaffolds # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^JH|^ABRN" \ | sed -e 's/^JH[0-9][0-9]*//; s/^ABRN[0-9][0-9]*//' | sort | uniq -c # 868326 .1 # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\|.*//; $line =~ s/\.1//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 11m26.151s # verify nothing lost from original: time faSize *.fa.gz # 2551980185 bases (219594130 N's 2332386055 real 2332386055 upper # 0 lower) in 240900 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # make sure none of the names got to be over 31 characers long: zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 181924 12 # 686402 8 ########################################################################## # Initial makeGenomeDb.pl (DONE - 2012-04-13 - Hiram) cd /hive/data/genomes/turTru2 cat << '_EOF_' > turTru2.config.ra # Config parameters for makeGenomeDb.pl: db turTru2 clade mammal genomeCladePriority 35 scientificName Tursiops truncatus commonName Dolphin assemblyDate Oct. 2011 assemblyLabel Baylor College of Medicine Ttru_1.4 (NCBI project 20365, GCA_000151865.2, WGS ABRN02) assemblyShortLabel Baylor Ttru_1.4 ncbiAssemblyName Ttru_1.4 ncbiAssemblyId 325108 orderKey 2364 mitoAcc NC_012059 fastaFiles /hive/data/genomes/turTru2/ucsc/*.fa.gz agpFiles /hive/data/genomes/turTru2/ucsc/*.agp.gz # qualFiles none dbDbSpeciesDir dolphin taxId 9739 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp turTru2.config.ra > agp.log 2>&1 # real 1m56.055s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) # finish it off time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev turTru2.config.ra > db.log 2>&1 # real 21m20.515s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/turTru2.unmasked.2bit /gbdb/turTru2/turTru2.2bit # browser should function now ######################################################################### # running repeat masker (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/turTru2/bed/repeatMasker cd /hive/data/genomes/turTru2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek turTru2 > do.log 2>&1 & # real 1304m42.773s cat faSize.rmsk.txt # 2551996573 bases (219594130 N's 2332402443 real 1317197601 upper # 1015204842 lower) in 240901 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # %39.78 masked total, %43.53 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ time featureBits -countGaps turTru2 rmsk # 1019407879 bases of 2551996573 (39.946%) in intersection # real 2m37.291s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/turTru2/bed/simpleRepeat cd /hive/data/genomes/turTru2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ turTru2 > do.log 2>&1 & # real 180m2.834s cat fb.simpleRepeat # 35337050 bases of 2332423317 (1.515%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/turTru2 twoBitMask turTru2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed turTru2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa turTru2.2bit stdout | faSize stdin > faSize.turTru2.2bit.txt cat faSize.turTru2.2bit.txt # 2551996573 bases (219594130 N's 2332402443 real 1316500848 upper # 1015901595 lower) in 240901 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # %39.81 masked total, %43.56 masked real rm /gbdb/turTru2/turTru2.2bit ln -s `pwd`/turTru2.2bit /gbdb/turTru2/turTru2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/turTru2/bed/gap cd /hive/data/genomes/turTru2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../turTru2.unmasked.2bit > findMotif.txt 2>&1 # real 0m26.240s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits turTru2 -not gap -bed=notGap.bed # 2332423317 bases of 2332423317 (100.000%) in intersection time featureBits turTru2 allGaps.bed notGap.bed -bed=new.gaps.bed # 20874 bases of 2332423317 (0.001%) in intersection # real 496m11.739s wc -l new.gaps.bed # 20823 new.gaps.bed # the biggest one here is 4 bases: awk '{print $3-$2,$0}' new.gaps.bed | sort -rn | less # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" turTru2 | sort -n | tail -1 # 216 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" turTru2 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps turTru2 other.bed # 20874 bases of 2551996573 (0.001%) in intersection # real 11m57.952s wc -l other.bed # 20823 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad turTru2 otherGap other.bed # Read 20823 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" turTru2 # 313713 hgsql turTru2 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" turTru2 # 334536 # == 20823 + 313713 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 turTru2 nonBridged.lift -bedFile=nonBridged.bed # one of these new ones is adjacent to an existing gap: # see example in danRer7.txt # are there non-bridged gaps here: hgsql -N -e "select bridge from gap;" turTru2 | sort | uniq -c # 334536 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/turTru2/bed/windowMasker cd /hive/data/genomes/turTru2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev turTru2 > do.log 2>&1 & # real 181m2.383s # Masking statistics twoBitToFa turTru2.wmsk.2bit stdout | faSize stdin # 2551996573 bases (219594130 N's 2332402443 real 1557512287 upper # 774890156 lower) in 240901 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # %30.36 masked total, %33.22 masked real twoBitToFa turTru2.wmsk.sdust.2bit stdout | faSize stdin # 2551996573 bases (219594130 N's 2332402443 real 1545284164 upper # 787118279 lower) in 240901 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # %30.84 masked total, %33.75 masked real hgLoadBed turTru2 windowmaskerSdust windowmasker.sdust.bed.gz # Read 13508939 elements of size 3 from windowmasker.sdust.bed.gz time featureBits -countGaps turTru2 windowmaskerSdust # 1006694538 bases of 2551996573 (39.447%) in intersection # real 3m33.405s # eliminate the gaps from the masking time featureBits turTru2 -not gap -bed=notGap.bed # 2332402443 bases of 2332402443 (100.000%) in intersection # real 1m30.308s time nice -n +19 featureBits turTru2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 787118279 bases of 2332402443 (33.747%) in intersection # real 318m12.378s # reload track to get it clean hgLoadBed turTru2 windowmaskerSdust cleanWMask.bed.gz # Read 13532042 elements of size 4 from cleanWMask.bed.gz time featureBits -countGaps turTru2 windowmaskerSdust # 787118279 bases of 2551996573 (30.843%) in intersection # real 3m20.475s zcat cleanWMask.bed.gz \ | twoBitMask ../../turTru2.unmasked.2bit stdin \ -type=.bed turTru2.cleanWMSdust.2bit twoBitToFa turTru2.cleanWMSdust.2bit stdout | faSize stdin \ > turTru2.cleanWMSdust.faSize.txt cat turTru2.cleanWMSdust.faSize.txt # 2551996573 bases (219594130 N's 2332402443 real 1545284164 upper # 787118279 lower) in 240901 sequences in 1 files # Total size: mean 10593.5 sd 37887.7 min 200 (ABRN02115282) # max 1003560 (JH472452) median 706 # %30.84 masked total, %33.75 masked real # how much does this window masker and repeat masker overlap: time featureBits -countGaps turTru2 rmsk windowmaskerSdust # 553188559 bases of 2551996573 (21.677%) in intersection # real 5m33.965s ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-13 - Hiram) # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence # cd /hive/data/genomes/turTru2 # twoBitMask -add bed/windowMasker/turTru2.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed turTru2.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa turTru2.2bit stdout | faSize stdin > faSize.turTru2.txt # cat faSize.turTru2.txt # 927696114 bases (111611440 N's 816084674 real 607935500 upper # 208149174 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.44 masked total, %25.51 masked real # create symlink to gbdb # rm /gbdb/turTru2/turTru2.2bit # ln -s `pwd`/turTru2.2bit /gbdb/turTru2/turTru2.2bit ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/turTru2/bed/cpgIslands cd /hive/data/genomes/turTru2/bed/cpgIslands time doCpgIslands.pl turTru2 > do.log 2>&1 # fixing broken command in the 'head' calculation doCpgIslands.pl -continue=cleanup turTru2 # Elapsed time: 8m21s cat fb.turTru2.cpgIslandExt.txt # 29423228 bases of 2332402443 (1.261%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/turTru2/bed/genscan cd /hive/data/genomes/turTru2/bed/genscan time doGenscan.pl turTru2 > do.log 2>&1 # recovering from power failure, kluster run had just finished # and it had just started on makeBed: time ./doMakeBed.csh # real 298m41.971s time doGenscan.pl -continue=load turTru2 > load.log 2>&1 # real 17m53.512s # failed out of inodes on the hive FS, continuing: time doGenscan.pl -continue=cleanup turTru2 > cleanup.log 2>&1 # real 20m48.384s cat fb.turTru2.genscan.txt # 53945792 bases of 2332402443 (2.313%) in intersection cat fb.turTru2.genscanSubopt.txt # 59031653 bases of 2332402443 (2.531%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-07 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2332402443 / 2897316137 \) \* 1024 # ( 2332402443 / 2897316137 ) * 1024 = 824.342249 # round up to 850 cd /hive/data/genomes/turTru2 time blat turTru2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/turTru2.11.ooc -repMatch=850 # Wrote 27158 overused 11-mers to jkStuff/turTru2.11.ooc # real 1m20.824s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" turTru2 | sort | uniq -c # 334536 yes # cd /hive/data/genomes/turTru2/jkStuff # gapToLift turTru2 turTru2.nonBridged.lift -bedFile=turTru2.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' turTru2.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-07 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Tursiops truncatus 102 8040 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add turTru2 just after ce2 # turTru2 (Dolphin) turTru2.serverGenome = /hive/data/genomes/turTru2/turTru2.2bit turTru2.clusterGenome = /hive/data/genomes/turTru2/turTru2.2bit turTru2.ooc = /hive/data/genomes/turTru2/jkStuff/turTru2.11.ooc turTru2.lift = no turTru2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} turTru2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} turTru2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} turTru2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} turTru2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} turTru2.refseq.mrna.native.load = no turTru2.refseq.mrna.xeno.load = yes turTru2.genbank.mrna.native.load = no turTru2.genbank.mrna.xeno.load = yes turTru2.genbank.est.native.load = no turTru2.downloadDir = turTru2 turTru2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding turTru2 Dolphin" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for turTruNames Tursiops truncatus" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S turTru2 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial turTru2 & # var/build/logs/2012.06.08-10:05:05.turTru2.initalign.log # real 1400m6.762s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad turTru2 & # logFile: var/dbload/hgwdev/logs/2012.06.12-13:43:53.dbload.log # real 4m45.316s # enable daily alignment and update of hgwdev (DONE - 2012-06-12 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add turTru2 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added turTru2." etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################## # LASTZi cow bosTau7 (DONE 2012-06-19 - Chin) # original alignment done at bosTau7.txt cd /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19 cat fb.bosTau7.chainTurTru2Link.txt # 1696154184 bases of 2804673174 (60.476%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau7/bed ln -s lastzTurTru2.2012-06-19 lastz.turTru2 # and the swap mkdir /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap cd /hive/data/genomes/turTru2/bed/blastz.bosTau7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/bosTau7/bed/lastzTurTru2.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 532m53.448s cat fb.turTru2.chainBosTau7Link.txt # 1625706583 bases of 2332402443 (69.701%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.bosTau7.swap lastz.bosTau7 ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH481929:22304-29703" where name="turTru2";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/turTru2/pushQ cd /hive/data/genomes/turTru2/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl turTru2 2> stderr.txt | grep -v transMap > turTru2.sql # real 3m41.168s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/turTru2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/turTru2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/turTru2/bbi/quality.bw # WARNING: turTru2 does not have seq # WARNING: turTru2 does not have extFile # WARNING: turTru2 does not have estOrientInfo scp -p turTru2.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/turTru2.sql" ############################################################################