# for emacs: -*- mode: sh; -*- # This file describes browser build for the gadMor1 # Atlantic Cod Gadus morhua genome: May 2010 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=CAEA01 # WGS, est. size: 830 Mbp # A new page of information at NCBI: # http://www.ncbi.nlm.nih.gov/genome/2661 ############################################################################# # Fetch sequence from genbank (DONE - 2011-12-22 - Hiram) mkdir -p /hive/data/genomes/gadMor1/genbank cd /hive/data/genomes/gadMor1/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Gadus_morhua/GadMor_May2010/*" # measure total sequence in this assembly faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 824311139 bases (216289280 N's 608021859 real 608021859 upper 0 lower) in 427427 sequences in 1 files # Total size: mean 1928.5 sd 38925.4 min 100 (gi|344901537|emb|CAEA01000187.1|) max 4999318 (gi|344906074|emb|HE569754.1|) median 376 # N count: mean 506.0 sd 12840.6 # U count: mean 1422.5 sd 26292.8 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # process into UCSC naming scheme (DONE - 2011-12-22 - Hiram) cd /hive/data/genomes/gadMor1/genbank cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*emb\|//; $line =~ s/\.1\|.*//; printf UC ">$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 0m28.106s # compress these files gzip *.fa *.agp # verify nothing has changed in the sequence, should be the same as above: faSize unplaced.fa.gz # 824311139 bases (216289280 N's 608021859 real 608021859 upper 0 lower) in # 427427 sequences in 1 files # Total size: mean 1928.5 sd 38925.4 min 100 (CAEA01000187) max 4999318 (HE569754) median 376 # N count: mean 506.0 sd 12840.6 # U count: mean 1422.5 sd 26292.8 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real ############################################################################# # Initial database build (DONE - 2011-12-22 - Hiram) cd /hive/data/genomes/gadMor1 cat << '_EOF_' > gadMor1.config.ra # Config parameters for makeGenomeDb.pl: db gadMor1 clade vertebrate genomeCladePriority 125 scientificName Gadus morhua commonName Atlantic cod assemblyDate May. 2010 assemblyLabel Genofisk GadMor_May2010 (GCA_000231765.1) assemblyShortLabel Genofisk GadMor_May2010 orderKey 478 mitoAcc NC_002081 fastaFiles /hive/data/genomes/gadMor1/genbank/unplaced.fa.gz agpFiles /hive/data/genomes/gadMor1/genbank/unplaced.agp.gz dbDbSpeciesDir gadMor taxId 8049 '_EOF_' # << happy emacs # first verify the sequence and AGP files are OK time makeGenomeDb.pl -stop=agp -workhorse=hgwdev gadMor1.config.ra \ > agp.log 2>&1 # real 0m53.946s # verify that was OK, look at the agp.log file time makeGenomeDb.pl -continue=db -workhorse=hgwdev gadMor1.config.ra \ > db.log 2>&1 # real 10m45.982s # verify that was OK, look at the do.log file # copy the trackDb business to the source tree, check it in and add # to the trackDb/makefile ############################################################################# # running repeat masker (DONE - 2011-12-29 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/repeatMasker cd /hive/data/genomes/gadMor1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk gadMor1 > do.log 2>&1 & # real 265m32.852s cat faSize.rmsk.txt # 824327835 bases (216289280 N's 608038555 real 541851562 upper # 66186993 lower) in 427428 sequences in 1 files # %8.03 masked total, %10.89 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker time featureBits -countGaps gadMor1 rmsk # real 3m0.062s # 66224843 bases of 824327835 (8.034%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2011-12-29-2012-01-04 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/simpleRepeat cd /hive/data/genomes/gadMor1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ gadMor1 > do.log 2>&1 & # real 2177m47.881s # two failed jobs, running them on hgwdev to see what fails: ./TrfRun.csh /hive/data/genomes/gadMor1/TrfPart/013/013.lst.bed > 013.log 2>&1 & ./TrfRun.csh /hive/data/genomes/gadMor1/TrfPart/015/015.lst.bed > 015.log 2>&1 # real 347m1.394s # These two jobs are failing because the lift file splits in so many # pieces that there is a name SplitLft.gz which is interpreted as # a gzipped file to the liftUp command. These needed to be done # manually with numeric extensions to three places: cd /scratch/tmp/doSimpleRepeat.cluster.M32046 split -a 3 -d -l 500 \ /hive/data/genomes/gadMor1/TrfPart/013/013.lft SplitLft. foreach splitLft (SplitLft.*) set bedFiles = `awk '{print $2 ".bed"};' $splitLft` endsInLf -zeroOk $bedFiles cat $bedFiles \ | liftUp -type=.bed tmpOut.$splitLft $splitLft error stdin end cat tmpOut.* > tmpOut__bed endsInLf -zeroOk tmpOut* cp -p tmpOut__bed /hive/data/genomes/gadMor1/TrfPart/013/013.lst.bed cd /scratch/tmp/doSimpleRepeat.cluster.L32048 split -a 3 -d -l 500 \ /hive/data/genomes/gadMor1/TrfPart/015/015.lft SplitLft. foreach splitLft (SplitLft.*) set bedFiles = `awk '{print $2 ".bed"};' $splitLft` endsInLf -zeroOk $bedFiles cat $bedFiles \ | liftUp -type=.bed tmpOut.$splitLft $splitLft error stdin end cat tmpOut.* > tmpOut__bed endsInLf -zeroOk tmpOut* cp -p tmpOut__bed /hive/data/genomes/gadMor1/TrfPart/015/015.lst.bed time doSimpleRepeat.pl -continue=filter -buildDir=`pwd` \ -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ gadMor1 > filter.log 2>&1 & # real 3m29.203s cat fb.simpleRepeat # 62230728 bases of 608038597 (10.235%) in intersection # do *not* add this to rmsk here since we are going to use # windowMasker instead of rmsk # add to rmsk after it is done: cd /hive/data/genomes/gadMor1 twoBitMask gadMor1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed gadMor1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa gadMor1.2bit stdout | faSize stdin > faSize.gadMor1.2bit.txt cat faSize.gadMor1.2bit.txt rm /gbdb/gadMor1/gadMor1.2bit ln -s `pwd`/gadMor1.2bit /gbdb/gadMor1/gadMor1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2011-12-29,2012-01-04 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/gap cd /hive/data/genomes/gadMor1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../gadMor1.unmasked.2bit > findMotif.txt 2>&1 # real 0m17.372s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits gadMor1 -not gap -bed=notGap.bed # real 2m42.073s # 608046566 bases of 608046566 (100.000%) in intersection time featureBits gadMor1 allGaps.bed notGap.bed -bed=new.gaps.bed # real 681m9.272s # 7969 bases of 608046566 (0.001%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" gadMor1 | sort -n | tail -1 # 1942 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" gadMor1 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps gadMor1 other.bed # real 8m36.778s # 7969 bases of 824327835 (0.001%) in intersection wc -l other.bed # 7853 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad gadMor1 otherGap other.bed # profile of sizes: awk '{print $3-$2}' other.bed | sort | uniq -c # 7763 1 # 78 2 # 5 3 # 3 4 # 2 5 # 1 6 # 1 7 # starting with this many hgsql -e "select count(*) from gap;" gadMor1 # 127442 hgsql gadMor1 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" gadMor1 # 135295 # == 127442 + 7853 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 gadMor1 nonBridged.lift -bedFile=nonBridged.bed # these warnings are misleading, they aren't overlapping gaps, they # are merely adjacent to existing gaps: # WARNING: overlapping gap at HE572041:1726761-1727075(other) and HE572041:1727075-1727076(other) # WARNING: overlapping gap at HE572319:1050744-1050745(other) and HE572319:1050745-1050904(fragment) # there are 27 of these locations # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" gadMor1 | sort | uniq -c # 135295 yes ########################################################################## ## WINDOWMASKER (DONE - 2011-12-29,2012-01-04 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/windowMasker cd /hive/data/genomes/gadMor1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev gadMor1 > do.log 2>&1 & # real 73m45.656s # Masking statistics twoBitToFa gadMor1.wmsk.2bit stdout | faSize stdin # 824327835 bases (216289280 N's 608038555 real 427305862 upper # 180732693 lower) in 427428 sequences in 1 files # %21.92 masked total, %29.72 masked real twoBitToFa gadMor1.wmsk.sdust.2bit stdout | faSize stdin # 824327835 bases (216289280 N's 608038555 real 418637635 upper # 189400920 lower) in 427428 sequences in 1 files # %22.98 masked total, %31.15 masked real hgLoadBed gadMor1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 3422833 elements of size 3 featureBits -countGaps gadMor1 windowmaskerSdust # real 5m27.388s # 405686830 bases of 824327835 (49.214%) in intersection # eliminate the gaps from the masking featureBits gadMor1 -not gap -bed=notGap.bed # real 3m11.708s # 608038597 bases of 608038597 (100.000%) in intersection time nice -n +19 featureBits gadMor1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # real 577m19.317s # 189400922 bases of 608038597 (31.149%) in intersection hgLoadBed gadMor1 windowmaskerSdust cleanWMask.bed.gz # Loaded 3485901 elements of size 4 featureBits -countGaps gadMor1 windowmaskerSdust # 714075485 bases of 1799143587 (39.690%) in intersection # mask the sequence with this clean mask zcat cleanWMask.bed.gz \ | twoBitMask ../../gadMor1.unmasked.2bit stdin \ -type=.bed gadMor1.cleanWMSdust.2bit twoBitToFa gadMor1.cleanWMSdust.2bit stdout | faSize stdin \ > gadMor1.cleanWMSdust.faSize.txt cat gadMor1.cleanWMSdust.faSize.txt # 824327835 bases (216289280 N's 608038555 real 418637635 upper # 189400920 lower) in 427428 sequences in 1 files # %22.98 masked total, %31.15 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps gadMor1 rmsk windowmaskerSdust # 280229203 bases of 1511735326 (18.537%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2011-12-04 - Hiram) cd /hive/data/genomes/gadMor1 twoBitMask -add bed/windowMasker/gadMor1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed gadMor1.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa gadMor1.2bit stdout | faSize stdin > faSize.gadMor1.txt cat faSize.gadMor1.txt # 824327835 bases (216289280 N's 608038555 real 418051473 upper # 189987082 lower) in 427428 sequences in 1 files # %23.05 masked total, %31.25 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/gadMor1/gadMor1.2bit ln -s `pwd`/gadMor1.2bit /gbdb/gadMor1/gadMor1.2bit ######################################################################## # cpgIslandExt (DONE - 2012-04-19 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/cpgIslands cd /hive/data/genomes/gadMor1/bed/cpgIslands # this procedure was done during development of the script to do all # this automatically. time ./doHardMask.csh real 876m43.648s time ./doMakeBed.csh real 24m19.176s time doCpgIslands.pl -continue=load time featureBits gadMor1 cpgIslandExt > fb.gadMor1.cpgIslandExt.txt 2>&1 # real 2m11.771s cat fb.gadMor1.cpgIslandExt.txt # 19965268 bases of 608038597 (3.284%) in intersection ######################################################################## # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/gadMor1/bed/genscan cd /hive/data/genomes/gadMor1/bed/genscan time doGenscan.pl gadMor1 > do.log 2>&1 # real 909m27.951s cat fb.gadMor1.genscan.txt # 31133526 bases of 608038597 (5.120%) in intersection cat fb.gadMor1.genscanSubopt.txt # 23477881 bases of 608038597 (3.861%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-03 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 608038555 / 2897316137 \) \* 1024 # ( 608038555 / 2897316137 ) * 1024 = 214.899393 # round up to 250 cd /hive/data/genomes/gadMor1 time blat gadMor1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/gadMor1.11.ooc -repMatch=250 # Wrote 9115 overused 11-mers to jkStuff/gadMor1.11.ooc # real 0m22.624s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" gadMor1 | sort | uniq -c # 135295 yes # cd /hive/data/genomes/gadMor1/jkStuff # gapToLift gadMor1 gadMor1.nonBridged.lift -bedFile=gadMor1.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' gadMor1.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-05-03 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Gadus morhua 636 255256 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add gadMor1 just after ce2 # gadMor1 (Atlantic cod) # Atlantic Cod Gadus morhua genome: May 2010 gadMor1.serverGenome = /hive/data/genomes/gadMor1/gadMor1.2bit gadMor1.clusterGenome = /hive/data/genomes/gadMor1/gadMor1.2bit gadMor1.ooc = /hive/data/genomes/gadMor1/jkStuff/gadMor1.11.ooc gadMor1.lift = no gadMor1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} gadMor1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} gadMor1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} gadMor1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} gadMor1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} gadMor1.refseq.mrna.native.load = no gadMor1.refseq.mrna.xeno.load = yes gadMor1.genbank.mrna.xeno.load = no gadMor1.genbank.est.native.load = yes gadMor1.downloadDir = gadMor1 gadMor1.perChromTables = no # end of section added to etc/genbank.conf git commit -m "adding gadMor1 Atlantic cod" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. git commit -m "adding definition for gadMorNames" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S gadMor1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial gadMor1 & # var/build/logs/2012.06.07-16:07:06.gadMor1.initalign.log # real 3488m35.403s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad gadMor1 & # logFile: var/dbload/hgwdev/logs/2012.06.11-09:25:12.dbload.log # real 43m4.176s # enable daily alignment and update of hgwdev (DONE - 2012-05-11 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add gadMor1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added gadMor1." etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position to RHO gene displays (DONE - 2012-07-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="HE567617:3931-7549" where name="gadMor1";' \ hgcentraltest ############################################################################ # pushQ entry (DONE - 2012-07-23 - Hiram) mkdir /hive/data/genomes/gadMor1/pushQ cd /hive/data/genomes/gadMor1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl gadMor1 2> stderr.txt | grep -v transMap > gadMor1.sql # real 3m56.143s scp -p gadMor1.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < gadMor1.sql ############################################################################