# for emacs: -*- mode: sh; -*- # # the above keeps emacs happy while working with this text document # This file describes how we made the browser database on the # Trichechus manatus latirostris (Florida manatee) # Broad Institute # http://www.ncbi.nlm.nih.gov/bioproject/68243 # http://www.ncbi.nlm.nih.gov/genome/10467 # http://www.ncbi.nlm.nih.gov/genome/assembly/328638/ # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AHIN01 # Genome Coverage : 150x # chrMt: NC_010302.1 - AM904728.1 # http://www.ncbi.nlm.nih.gov/bioproject/28669 # http://www.ncbi.nlm.nih.gov/nuccore/NC_010302.1 # DATE: 07-Oct-2011 # ORGANISM: Trichechus manatus latirostris # TAXID: 127582 # ASSEMBLY LONG NAME: TriManLat1.0 # ASSEMBLY SHORT NAME: TriManLat1.0 # ASSEMBLY SUBMITTER: Broad Institute of MIT and Harvard # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000243295.1 # FTP-RELEASE DATE: 24-Jan-2012 # Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Coelomata; # Deuterostomia; Chordata; Craniata; Vertebrata; Gnathostomata; # Teleostomi; Euteleostomi; Sarcopterygii; Tetrapoda; Amniota; Mammalia; # Theria; Eutheria; Afrotheria; Sirenia; Trichechidae; Trichechus; # Trichechus manatus ######################################################################### ## Download sequence (DONE - 2012-03-26 - Hiram) mkdir /hive/data/genomes/triMan1 mkdir /hive/data/genomes/triMan1/genbank cd /hive/data/genomes/triMan1/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Trichechus_manatus/TriManLat1.0/ ./ faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 3103791524 bases (334708729 N's 2769082795 real 2769082795 upper 0 lower) # in 6322 sequences in 1 files # Total size: mean 490950.9 sd 2814444.1 # min 1000 (gi|373254797|gb|AHIN01166489.1|) # max 45942467 (gi|373522909|gb|JH594607.1|) median 3585 faCount Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ > faCount.txt egrep -v "^#seq|^total" faCount.txt \ | awk '{printf "%s\t%d\n", $1, $2}' \ | sed -e 's/^gi.*gb|//; s/|//' | sort -k2,2nr > chrom.sizes n50.pl chrom.sizes # reading: chrom.sizes # contig count: 6322, total size: 3103791524, one half size: 1551895762 # cumulative N50 count contig contig size 1545802679 66 JH594672.1 14775788 1551895762 one half size 1560245362 67 JH594674.1 14442683 # the automatic pickup of the chrMt sequence fails because the name # in the fasta is only: "Trichechus manatus" and the script is # expecting "Trichechus manatus latirostris" # So, pick up the sequence and include it with the wild card # specification of files here: wget -O NC_010302.NCBI.fa \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=NC_010302" sed -e 's/^>.*/>chrM/' NC_010302.NCBI.fa > chrM.fa faCount NC_010302.fa #seq len A C G T N cpg NC_010302 16882 5129 4760 2474 4519 0 472 echo "chrM 1 16882 1 O NC_010302.1 1 16882 +" | tr '[ ]' '[\t]' > chrM.agp ######################################################################### # process into UCSC naming scheme (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/triMan1/ucsc cd /hive/data/genomes/triMan1/ucsc # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^JH|AHIN" \ | sed -e 's/^JH[0-9][0-9]*//; s/^AHIN[0-9][0-9]*//' | sort | uniq -c # 326656 .1 # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 13m48.991s # -rw-rw-r-- 1 4475827 Mar 27 12:07 unplaced.agp.gz # -rw-rw-r-- 1 855931027 Mar 27 12:21 unplaced.fa.gz # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa.gz # 3103791524 bases (334708729 N's 2769082795 real 2769082795 upper 0 lower) # in 6322 sequences in 1 files # Total size: mean 490950.9 sd 2814444.1 min 1000 (AHIN01166489) # max 45942467 (JH594607) median 3585 # the automatic pickup of the chrMt sequence fails because the name # in the fasta is only: "Trichechus manatus" and the script is # expecting "Trichechus manatus latirostris" # So, pick up the sequence and include it with the wild card # specification of files here: cat ../genbank/chrM.fa | gzip -c > chrM.fa.gz cat ../genbank/chrM.agp | gzip -c > chrM.agp.gz ############################################################################# # Initial database build (DONE - 2012-03-27 - Hiram) cd /hive/data/genomes/triMan1 cat << '_EOF_' > triMan1.config.ra # Config parameters for makeGenomeDb.pl: db triMan1 clade mammal genomeCladePriority 57 scientificName Trichechus manatus latirostris commonName Manatee assemblyDate Oct. 2011 assemblyLabel Broad Institute of MIT and Harvard TriManLat1.0 (GCA_000243295.1) assemblyShortLabel BROAD v1.0 ncbiAssemblyName TriManLat1.0 ncbiAssemblyId 328638 orderKey 3380 # chrM already included mitoAcc none fastaFiles /hive/data/genomes/triMan1/ucsc/*.fa.gz agpFiles /hive/data/genomes/triMan1/ucsc/*.agp.gz dbDbSpeciesDir manatee taxId 127582 '_EOF_' # << happy emacs # first verify the sequence and AGP files are OK time makeGenomeDb.pl -stop=agp -workhorse=hgwdev triMan1.config.ra \ > agp.log 2>&1 # real 2m26.020s # verify that was OK, look at the agp.log file time makeGenomeDb.pl -continue=db -workhorse=hgwdev triMan1.config.ra \ > db.log 2>&1 # real 10m45.982s # verify that was OK, look at the do.log file # copy the trackDb business to the source tree, check it in and add # to the trackDb/makefile ############################################################################# # running repeat masker (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/triMan1/bed/repeatMasker cd /hive/data/genomes/triMan1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek triMan1 > do.log 2>&1 & # real 903m55.735s cat faSize.rmsk.txt # 3103808406 bases (334708729 N's 2769099677 real 1766179614 upper # 1002920063 lower) in 6323 sequences in 1 files # Total size: mean 490875.9 sd 2814227.8 min 1000 (AHIN01166489) # max 45942467 (JH594607) median 3585 # %32.31 masked total, %36.22 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps triMan1 rmsk # 1005578625 bases of 3103808406 (32.398%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, this featureBits count # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/triMan1/bed/simpleRepeat cd /hive/data/genomes/triMan1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ triMan1 > do.log 2>&1 & # real 14m44.480s cat fb.simpleRepeat # 22630835 bases of 2769099677 (0.817%) in intersection # Add this mask to the RM result: cd /hive/data/genomes/triMan1 twoBitMask triMan1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed triMan1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa triMan1.2bit stdout | faSize stdin > faSize.triMan1.2bit.txt cat faSize.triMan1.2bit.txt # 2608572064 bases (131440969 N's 2477131095 real 1320629270 upper # 1156501825 lower) in 2685 sequences in 1 files # Total size: mean 971535.2 sd 4827933.6 min 1003 (AGCE01151413) # max 72162052 (JH378105) median 1553 # reset the symlink to provide this masked 2bit file rm /gbdb/triMan1/triMan1.2bit ln -s `pwd`/triMan1.2bit /gbdb/triMan1/triMan1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/triMan1/bed/gap cd /hive/data/genomes/triMan1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../triMan1.unmasked.2bit > findMotif.txt 2>&1 # real 0m47.149s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps triMan1 -not gap -bed=notGap.bed # 2769099677 bases of 3103808406 (89.216%) in intersection # real 0m15.811s time featureBits -countGaps triMan1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 3103808406 (0.000%) in intersection # real 5m7.362s # no unannotated gaps, no more to run here # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" triMan1 | sort | uniq -c # 160167 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-03-27 - Hiram) mkdir /hive/data/genomes/triMan1/bed/windowMasker cd /hive/data/genomes/triMan1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev triMan1 > do.log 2>&1 & # real 239m19.270s # Masking statistics twoBitToFa triMan1.wmsk.2bit stdout | faSize stdin # 3103808406 bases (334708729 N's 2769099677 real 1590352510 upper # 1178747167 lower) in 6323 sequences in 1 files # Total size: mean 490875.9 sd 2814227.8 min 1000 (AHIN01166489) # max 45942467 (JH594607) median 3585 # %37.98 masked total, %42.57 masked real twoBitToFa triMan1.wmsk.sdust.2bit stdout | faSize stdin # 3103808406 bases (334708729 N's 2769099677 real 1575281068 upper # 1193818609 lower) in 6323 sequences in 1 files # Total size: mean 490875.9 sd 2814227.8 min 1000 (AHIN01166489) # max 45942467 (JH594607) median 3585 # %38.46 masked total, %43.11 masked real hgLoadBed triMan1 windowmaskerSdust windowmasker.sdust.bed.gz # Read 14564308 elements of size 3 from windowmasker.sdust.bed.gz featureBits -countGaps triMan1 windowmaskerSdust # 1528527338 bases of 3103808406 (49.247%) in intersection # eliminate the gaps from the masking featureBits triMan1 -not gap -bed=notGap.bed # 2769099677 bases of 2769099677 (100.000%) in intersection time nice -n +19 featureBits triMan1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 1193818609 bases of 2769099677 (43.112%) in intersection # real 4m35.017s # reload track to get it clean hgLoadBed triMan1 windowmaskerSdust cleanWMask.bed.gz # Read 14633038 elements of size 4 from cleanWMask.bed.gz time featureBits -countGaps triMan1 windowmaskerSdust # 1193818609 bases of 3103808406 (38.463%) in intersection # real 1m25.053s # mask with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../triMan1.unmasked.2bit stdin \ -type=.bed triMan1.cleanWMSdust.2bit twoBitToFa triMan1.cleanWMSdust.2bit stdout | faSize stdin \ > triMan1.cleanWMSdust.faSize.txt cat triMan1.cleanWMSdust.faSize.txt # 3103808406 bases (334708729 N's 2769099677 real 1575281068 upper # 1193818609 lower) in 6323 sequences in 1 files # Total size: mean 490875.9 sd 2814227.8 min 1000 (AHIN01166489) # max 45942467 (JH594607) median 3585 # %38.46 masked total, %43.11 masked real # how much does this window masker and repeat masker overlap: time featureBits -countGaps triMan1 rmsk windowmaskerSdust # 550218805 bases of 3103808406 (17.727%) in intersection # real 1m55.663s ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-01-12 - Hiram) # not running this procedure since the RM result does well enough # cd /hive/data/genomes/triMan1 # twoBitMask -add bed/windowMasker/triMan1.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed triMan1.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa triMan1.2bit stdout | faSize stdin > faSize.triMan1.txt # cat faSize.triMan1.txt # 2860591921 bases (676999153 N's 2183592768 real 1186074582 upper # 997518186 lower) in 22819 sequences in 1 files # Total size: mean 125360.1 sd 395781.0 min 1000 (AFYH01291818) # max 10736886 (JH126562) median 5475 # %34.87 masked total, %45.68 masked real # create symlink to gbdb # ssh hgwdev # rm /gbdb/triMan1/triMan1.2bit # ln -s `pwd`/triMan1.2bit /gbdb/triMan1/triMan1.2bit ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-03-31 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2769099677 / 2897316137 \) \* 1024 # ( 2769099677 / 2897316137 ) * 1024 = 978.684388 # round up to 1000 cd /hive/data/genomes/triMan1 blat triMan1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/triMan1.11.ooc -repMatch=1000 # Wrote 35505 overused 11-mers to jkStuff/triMan1.11.ooc # copy all of this stuff to the klusters: # there are no non-bridged gaps hgsql -N -e "select bridge from gap;" triMan1 | sort | uniq -c # 160167 yes # cd /hive/data/genomes/triMan1/jkStuff # gapToLift triMan1 nonBridged.lift -bedFile=nonBridged.bed cd /hive/data/genomes/triMan1 mkdir /hive/data/staging/data/triMan1 cp -p jkStuff/triMan1.11.ooc chrom.sizes \ triMan1.2bit /hive/data/staging/data/triMan1 # request rsync copy from cluster admin ######################################################################## # GENBANK AUTO UPDATE (WORKING - 2012-03-31 - Hiram) ssh hgwdev screen -S triMan1Genbank # use a screen to manage this job cd ~/kent/src/hg/makeDb/genbank git pull # add the following two lines to src/lib/gbGenome.c # static char *triManNames[] = {"Trichechus manatus", # "Trichechus manatus latirostris", NULL}; # and later: # {"triMan", triManNames}, git commit \ -m "adding Trichechus manatus latirostris (Florida manatee) triMan1" \ src/lib/gbGenome.c git push make install-server # edit etc/genbank.conf to add latMan1 just before dasNov1 # triMan1 (manatee) triMan1.serverGenome = /hive/data/genomes/triMan1/triMan1.2bit triMan1.clusterGenome = /hive/data/genomes/triMan1/triMan1.2bit triMan1.ooc = /hive/data/genomes/triMan1/jkStuff/triMan1.11.ooc triMan1.lift = no triMan1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} triMan1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} triMan1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} triMan1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} triMan1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} triMan1.refseq.mrna.native.load = no triMan1.refseq.mrna.xeno.load = yes triMan1.genbank.mrna.xeno.load = yes triMan1.genbank.est.native.load = no triMan1.downloadDir = triMan1 triMan1.perChromTables = no git commit \ -m "adding Trichechus manatus latirostris (Florida manatee) triMan1" \ etc/genbank.conf # update /cluster/data/genbank/: make etc-update cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -keep -initial triMan1 & # logFile: var/build/logs/2012.03.31-18:39:58.triMan1.initalign.log # real 2584m45.122s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad triMan1 # logFile: var/dbload/hgwdev/logs/2012.04.03-15:02:21.dbload.log # real 66m40.533s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank get pull # add triMan1 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added triMan1 - Manatee" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/triMan1/bed/cpgIslands cd /hive/data/genomes/triMan1/bed/cpgIslands time doCpgIslands.pl triMan1 > do.log 2>&1 # real 23m49.858s cat fb.triMan1.cpgIslandExt.txt # 18085037 bases of 2769099677 (0.653%) in intersection ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/triMan1/bed/genscan cd /hive/data/genomes/triMan1/bed/genscan time doGenscan.pl triMan1 > do.log 2>&1 # real 24m8.291s # five broken jobs, rerunning with 2000000 window size ./runGsBig.csh JH594608 000 gtf/000/JH594608.gtf pep/000/JH594608.pep subopt/000/JH594608.bed & ./runGsBig.csh JH594626 000 gtf/000/JH594626.gtf pep/000/JH594626.pep subopt/000/JH594626.bed & ./runGsBig.csh JH594628 000 gtf/000/JH594628.gtf pep/000/JH594628.pep subopt/000/JH594628.bed & ./runGsBig.csh JH594666 000 gtf/000/JH594666.gtf pep/000/JH594666.pep subopt/000/JH594666.bed & ./runGsBig.csh JH594816 000 gtf/000/JH594816.gtf pep/000/JH594816.pep subopt/000/JH594816.bed # about 20 minutes # continuing: time doGenscan.pl -continue=makeBed triMan1 > makeBed.log 2>&1 # real 4m0.970s cat fb.triMan1.genscan.txt # 78249566 bases of 2769099677 (2.826%) in intersection cat fb.triMan1.genscanSubopt.txt # 81010525 bases of 2769099677 (2.926%) in intersection ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-26 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH594607:13029229-13038048" where name="triMan1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-26 - Hiram) mkdir /hive/data/genomes/triMan1/pushQ cd /hive/data/genomes/triMan1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl triMan1 2> stderr.txt | grep -v transMap > triMan1.sql # real 3m38.582s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/triMan1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/triMan1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/triMan1/bbi/quality.bw # WARNING: triMan1 does not have seq # WARNING: triMan1 does not have extFile # WARNING: triMan1 does not have estOrientInfo scp -p triMan1.sql hgwbeta:/tmp ssh hgwbeta "hgsql qapushq < /tmp/triMan1.sql" ############################################################################