# for emacs: -*- mode: sh; -*- # DATE: 23-Jan-2012 # ORGANISM: Heterocephalus glaber # TAXID: 10181 # ASSEMBLY LONG NAME: HetGla_female_1.0 # ASSEMBLY SHORT NAME: HetGla_female_1.0 # ASSEMBLY SUBMITTER: Broad Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000247695.1 # FTP-RELEASE DATE: 10-Mar-2012 # http://www.ncbi.nlm.nih.gov/genome/6995 # http://www.ncbi.nlm.nih.gov/genome/assembly/362148/ # http://www.ncbi.nlm.nih.gov/bioproject/72441 # http://www.ncbi.nlm.nih.gov/bioproject/62663 - chrMt - NC_015112.1 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AHKG01 # Genome Coverage : 90x # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10181 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_female_1.0/ ########################################################################## # Download sequence (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2 cd /hive/data/genomes/hetGla2 mkdir genbank cd genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Heterocephalus_glaber/HetGla_female_1.0/ ./ # real 15m42.132s # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2618188253 bases (303433536 N's 2314754717 real 2314754717 upper # 0 lower) in 4228 sequences in 1 files # Total size: mean 619249.8 sd 3685910.4 # min 1000 (gi|375378318|gb|AHKG01114633.1|) # max 78885850 (gi|376403570|gb|JH602043.1|) median 1736 ########################################################################## # fixup names for UCSC standards (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2/ucsc cd /hive/data/genomes/hetGla2/ucsc ######################## Unplaced scaffolds # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | egrep "^JH|^AHKG" \ | sed -e 's/^JH[0-9][0-9]*//; s/^AHKG[0-9][0-9]*//' | sort | uniq -c # 225076 .1 # this is like the unplaced.pl script in other assemblies except it # does not add chrUn_ to the names since they are all just scaffolds cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, "|gzip -c > unplaced.agp.gz") or die "can not write to unplaced.agp.gz"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, "|gzip -c > unplaced.fa.gz") or die "can not write to unplaced.fa.gz"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\|.*//; $line =~ s/\.1//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 11m32.064s # verify nothing lost from original: time faSize *.fa.gz # 2618188253 bases (303433536 N's 2314754717 real 2314754717 upper 0 # lower) in 4228 sequences in 1 files # Total size: mean 619249.8 sd 3685910.4 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # real 0m45.073s # make sure none of the names got to be over 31 characers long: zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head # 3134 12 # 221942 8 ########################################################################## # Initial makeGenomeDb.pl (DONE - 2012-04-13 - Hiram) cd /hive/data/genomes/hetGla2 # mitoAcc - chrMt sequence is included in the download files cat << '_EOF_' > hetGla2.config.ra # Config parameters for makeGenomeDb.pl: db hetGla2 clade mammal genomeCladePriority 35 scientificName Heterocephalus glaber commonName Naked Mole Rat assemblyDate Jan. 2012 assemblyLabel Broad Institute HetGla_female_1.0 (NCBI project 72441, GCA_000247695.1, WGS AHKG01) assemblyShortLabel Broad HetGla_female_1.0 ncbiAssemblyName HetGla_female_1.0 ncbiAssemblyId 362148 orderKey 1649 mitoAcc NC_015112 fastaFiles /hive/data/genomes/hetGla2/ucsc/*.fa.gz agpFiles /hive/data/genomes/hetGla2/ucsc/*.agp.gz # qualFiles none dbDbSpeciesDir moleRat taxId 10181 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp hetGla2.config.ra > agp.log 2>&1 # real 1m59.793s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) # finish it off time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev hetGla2.config.ra > db.log 2>&1 # real 17m39.702s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/hetGla2.unmasked.2bit /gbdb/hetGla2/hetGla2.2bit # browser should function now ######################################################################### # running repeat masker (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/repeatMasker cd /hive/data/genomes/hetGla2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek hetGla2 > do.log 2>&1 & # real 1042m39.868s cat faSize.rmsk.txt # 2618204639 bases (303433536 N's 2314771103 real 1567576531 upper # 747194572 lower) in 4229 sequences in 1 files # Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # %28.54 masked total, %32.28 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ featureBits -countGaps hetGla2 rmsk # 748037030 bases of 2618204639 (28.571%) in intersection # real 0m25.109s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/simpleRepeat cd /hive/data/genomes/hetGla2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ hetGla2 > do.log 2>&1 & # real 14m26.766s cat fb.simpleRepeat # 23745957 bases of 2314771103 (1.026%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/hetGla2 twoBitMask hetGla2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed hetGla2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa hetGla2.2bit stdout | faSize stdin > faSize.hetGla2.2bit.txt cat faSize.hetGla2.2bit.txt # 2618204639 bases (303433536 N's 2314771103 real 1566940532 upper # 747830571 lower) in 4229 sequences in 1 files # Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # %28.56 masked total, %32.31 masked real rm /gbdb/hetGla2/hetGla2.2bit ln -s `pwd`/hetGla2.2bit /gbdb/hetGla2/hetGla2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/gap cd /hive/data/genomes/hetGla2/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../hetGla2.unmasked.2bit > findMotif.txt 2>&1 # real 0m20.133s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits hetGla2 -not gap -bed=notGap.bed # 2314771103 bases of 2314771103 (100.000%) in intersection # real 0m16.020s time featureBits hetGla2 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 2314771103 (0.000%) in intersection # real 2m28.844s # no new gaps, nothing to do here # are there non-bridged gaps here: hgsql -N -e "select bridge from gap;" hetGla2 | sort | uniq -c # 110424 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-04-13 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/windowMasker cd /hive/data/genomes/hetGla2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev hetGla2 > do.log 2>&1 & # real 198m5.178s # Masking statistics twoBitToFa hetGla2.wmsk.2bit stdout | faSize stdin # 2618204639 bases (303433536 N's 2314771103 real 1603612350 upper # 711158753 lower) in 4229 sequences in 1 files # Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # %27.16 masked total, %30.72 masked real twoBitToFa hetGla2.wmsk.sdust.2bit stdout | faSize stdin # 2618204639 bases (303433536 N's 2314771103 real 1591491612 upper # 723279491 lower) in 4229 sequences in 1 files # Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # %27.63 masked total, %31.25 masked real hgLoadBed hetGla2 windowmaskerSdust windowmasker.sdust.bed.gz # Read 13599339 elements of size 3 from windowmasker.sdust.bed.gz time featureBits -countGaps hetGla2 windowmaskerSdust # 1185785055 bases of 2808525991 (42.221%) in intersection # eliminate the gaps from the masking time featureBits hetGla2 -not gap -bed=notGap.bed # 2314771103 bases of 2314771103 (100.000%) in intersection # real 0m15.302s # 2525294057 bases of 2525294057 (100.000%) in intersection time nice -n +19 featureBits hetGla2 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 723279491 bases of 2314771103 (31.246%) in intersection # real 3m10.554s # reload track to get it clean hgLoadBed hetGla2 windowmaskerSdust cleanWMask.bed.gz # Read 13672561 elements of size 4 from cleanWMask.bed.gz time featureBits -countGaps hetGla2 windowmaskerSdust # 723279491 bases of 2618204639 (27.625%) in intersection # real 1m20.945s zcat cleanWMask.bed.gz \ | twoBitMask ../../hetGla2.unmasked.2bit stdin \ -type=.bed hetGla2.cleanWMSdust.2bit twoBitToFa hetGla2.cleanWMSdust.2bit stdout | faSize stdin \ > hetGla2.cleanWMSdust.faSize.txt cat hetGla2.cleanWMSdust.faSize.txt # 2618204639 bases (303433536 N's 2314771103 real 1591491612 upper # 723279491 lower) in 4229 sequences in 1 files # Total size: mean 619107.3 sd 3685486.1 min 1000 (AHKG01114633) # max 78885850 (JH602043) median 1736 # %27.63 masked total, %31.25 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps hetGla2 rmsk windowmaskerSdust # 360810714 bases of 2618204639 (13.781%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2012-04-13 - Hiram) # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence # cd /hive/data/genomes/hetGla2 # twoBitMask -add bed/windowMasker/hetGla2.cleanWMSdust.2bit \ # bed/simpleRepeat/trfMask.bed hetGla2.2bit # safe to ignore the warnings about BED file with >=13 fields # twoBitToFa hetGla2.2bit stdout | faSize stdin > faSize.hetGla2.txt # cat faSize.hetGla2.txt # 927696114 bases (111611440 N's 816084674 real 607935500 upper # 208149174 lower) in 5678 sequences in 1 files # Total size: mean 163384.3 sd 1922194.0 min 1000 (AERX01077754-1) # max 51042256 (chrLG7) median 2009 # %22.44 masked total, %25.51 masked real # create symlink to gbdb # rm /gbdb/hetGla2/hetGla2.2bit # ln -s `pwd`/hetGla2.2bit /gbdb/hetGla2/hetGla2.2bit ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/cpgIslands cd /hive/data/genomes/hetGla2/bed/cpgIslands time doCpgIslands.pl hetGla2 > do.log 2>&1 # real 7m57.656s cat fb.hetGla2.cpgIslandExt.txt # 21205888 bases of 2314771103 (0.916%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/hetGla2/bed/genscan cd /hive/data/genomes/hetGla2/bed/genscan time doGenscan.pl hetGla2 > do.log 2>&1 # real 32m10.608s # Completed: 4227 of 4229 jobs # Crashed: 2 jobs # CPU time in finished jobs: 52691s 878.18m 14.64h 0.61d 0.002 y # IO & Wait Time: 15228s 253.80m 4.23h 0.18d 0.000 y # Average job time: 16s 0.27m 0.00h 0.00d # Longest finished job: 1770s 29.50m 0.49h 0.02d # Submission to last job: 1993s 33.22m 0.55h 0.02d # two broken jobs, reducing the window size to 2000000 seems to work: ./runGsBig.csh JH602052 000 gtf/000/JH602052.gtf pep/000/JH602052.pep subopt/000/JH602052.bed & ./runGsBig.csh JH602053 000 gtf/000/JH602053.gtf pep/000/JH602053.pep subopt/000/JH602053.bed # real real 12m3.901s # continuing: time doGenscan.pl -continue=makeBed hetGla2 > makeBed.log 2>&1 # real 9m49.111s cat fb.hetGla2.genscan.txt # 65077826 bases of 2314771103 (2.811%) in intersection cat fb.hetGla2.genscanSubopt.txt # 60777320 bases of 2314771103 (2.626%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-07 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2314771103 / 2897316137 \) \* 1024 # ( 2314771103 / 2897316137 ) * 1024 = 818.110795 # round up to 850 (hetGla1 was 850) cd /hive/data/genomes/hetGla2 time blat hetGla2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/hetGla2.11.ooc -repMatch=850 # Wrote 26730 overused 11-mers to jkStuff/hetGla2.11.ooc # real 0m55.389s # hetGla1 was: Wrote 33104 overused 11-mers to jkStuff/hetGla1.11.ooc # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" hetGla2 | sort | uniq -c # 110424 yes # cd /hive/data/genomes/hetGla2/jkStuff # gapToLift hetGla2 hetGla2.nonBridged.lift -bedFile=hetGla2.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' hetGla2.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-07 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Heterocephalus glaber 45 0 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add hetGla2 just after ce2 # hetGla2 (naked mole rat) hetGla2.serverGenome = /hive/data/genomes/hetGla2/hetGla2.2bit hetGla2.clusterGenome = /hive/data/genomes/hetGla2/hetGla2.2bit hetGla2.ooc = /hive/data/genomes/hetGla2/jkStuff/hetGla2.11.ooc hetGla2.lift = no hetGla2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} hetGla2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} hetGla2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} hetGla2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} hetGla2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} hetGla2.refseq.mrna.native.load = no hetGla2.refseq.mrna.xeno.load = yes hetGla2.genbank.mrna.native.load = no hetGla2.genbank.mrna.xeno.load = yes hetGla2.genbank.est.native.load = no hetGla2.downloadDir = hetGla2 hetGla2.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding hetGla2 naked mole rat" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S hetGla2 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial hetGla2 & # var/build/logs/2012.05.07-10:10:42.hetGla2.initalign.log # real 1716m59.594s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad hetGla2 & # logFile: var/dbload/hgwdev/logs/2012.05.08-16:05:34.dbload.log # real 64m32.488s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add hetGla2 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added hetGla2." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="JH602185:522994-528500" where name="hetGla2";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-23 - Hiram) mkdir /hive/data/genomes/hetGla2/pushQ cd /hive/data/genomes/hetGla2/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl hetGla2 2> stderr.txt | grep -v transMap > hetGla2.sql # real 3m59.755s scp -p hetGla2.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < hetGla2.sql ############################################################################