# for emacs: -*- mode: sh; -*- # Gorilla gorilla gorilla - Sanger release 2 ######################################################################### # DOWNLOAD SEQUENCE (DONE - 2009-10-14 - Hiram) mkdir /hive/data/genomes/gorGor2 cd /hive/data/genomes/gorGor2 mkdir sanger cd sanger for F in README ggo_r4-split.fa.gz ggo_r4.agp.gz ggo_r4.fa.gz do wget --timestamping "ftp://ftp.sanger.ac.uk/pub/rd/gorilla/release2/${F}" done # remove blank lines from the fasta file, and fixup MT name: mv ggo_r4-split.fa.gz ggo_r4-split_blankLines.fa.gz zcat ggo_r4-split_blankLines.fa.gz | grep -v "^$" | gzip > ucsc.split.fa.gz zcat ggo_r4.agp.gz | sed -e "s/^MT/chrM/" | gzip > ucsc.agp.gz ######################################################################### # running makeGenomeDb.pl (DONE - 2009-10-28 - Hiram) cat << '_EOF_' > gorGor2.config.ra # Config parameters for makeGenomeDb.pl: db gorGor2 clade mammal genomeCladePriority 12 scientificName Gorilla gorilla gorilla commonName Gorilla assemblyDate Aug. 2009 assemblyLabel Sanger Institute Aug 2009 (NCBI project 31265) orderKey 26 # chrM sequence is included in the release and AGP file mitoAcc none fastaFiles /hive/data/genomes/gorGor2/sanger/ucsc.split.fa.gz agpFiles /hive/data/genomes/gorGor2/sanger/ucsc.agp.gz # qualFiles none dbDbSpeciesDir gorilla taxId 9595 '_EOF_' # << happy emacs # run stepwise to see where problems arise: makeGenomeDb.pl -stop=seq gorGor2.config.ra > seq.log 2>&1 makeGenomeDb.pl -continue=agp -stop=agp gorGor2.config.ra > agp.log 2>&1 makeGenomeDb.pl -continue=db -stop=db gorGor2.config.ra > db.log 2>&1 # failed due to chrom names going to 25 characters and the index doesn't work. # this bug is fixed in later versions of makeGenomeDb.pl cd /hive/data/genomes/gorGor2/bed/chromInfo sed -e "s/16/21/" $HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql hgLoadSqlTab gorGor2 chromInfo chromInfo.sql chromInfo.tab makeGenomeDb.pl -continue=db -stop=db gorGor2.config.ra > db.log 2>&1 cd /hive/data/genomes/gorGor2 hgGoldGapGl -noGl gorGor2 gorGor2.agp mkdir -p /gbdb/gorGor2/wib rm -f /gbdb/gorGor2/wib/gc5Base.wib ln -s /cluster/data/gorGor2/bed/gc5Base/gc5Base.wib /gbdb/gorGor2/wib hgLoadWiggle -pathPrefix=/gbdb/gorGor2/wib \ gorGor2 gc5Base /cluster/data/gorGor2/bed/gc5Base/gc5Base.wig rm -f wiggle.tab makeGenomeDb.pl -continue=dbDb -stop=dbDb gorGor2.config.ra > dbDb.log 2>&1 makeGenomeDb.pl -continue=trackDb -stop=trackDb gorGor2.config.ra \ > trackDb.log 2>&1 ########################################################################## # running repeat masker (DONE - 2009-11-02,03 - Hiram) mkdir /hive/data/genomes/gorGor2/bed/repeatMasker cd /hive/data/genomes/gorGor2/bed/repeatMasker doRepeatMasker.pl -buildDir=`pwd` -noSplit -bigClusterHub=swarm \ -workhorse=hgwdev gorGor2 > do.log 2>&1 cat faSize.rmsk.txt # 3035791622 bases (206104414 N's 2829687208 real 1467628472 upper # 1362058736 lower) in 58049 sequences in 1 files # %44.87 masked total, %48.13 masked real ########################################################################## # running simple repeat (DONE - 2009-11-02-2010-02-20 - Hiram) mkdir /hive/data/genomes/gorGor2/bed/simpleRepeats cd /hive/data/genomes/gorGor2/bed/simpleRepeats time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \ -workhorse=hgwdev gorGor2 > do.log 2>&1 & # there was one very odd bit of sequence in this business # namely: chr4_48641019_10247101 # trf could not complete this one, even after days of trying # broke it up into 130,000 sized chunks and even that took 56 minutes # on swarm. Putting that result together with all of the other # bits in that bunch 037.lst, and continuing: time doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \ -continue=filter -workhorse=hgwdev gorGor2 > filter.log 2>&1 & # real 1m8.940s cat fb.simpleRepeat # 199677936 bases of 2829709756 (7.056%) in intersection # interesting high number there. I wonder if it is from this odd # sequence ? Evidently not just this one, there appear to be others # where regions of more than 100,000 were marked as repeats # eg: chr20_26077633_1186791 marked 500,000 chunk as a repeat cd /hive/data/genomes/gorGor2 twoBitMask gorGor2.rmsk.2bit \ -add bed/simpleRepeats/trfMask.bed gorGor2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa gorGor2.2bit stdout | faSize stdin > faSize.gorGor2.2bit.txt cat faSize.gorGor2.2bit.txt # 3035791622 bases (206104414 N's 2829687208 real 1466160275 upper # 1363526933 lower) in 58049 sequences in 1 files # %44.92 masked total, %48.19 masked real rm /gbdb/gorGor2/gorGor2.2bit ln -s `pwd`/gorGor2.2bit /gbdb/gorGor2/gorGor2.2bit ######################################################################## # Marking *all* gaps - they are not all in the AGP file # (DONE - 2010-02-20 - Hiram) # verify all gaps are defined in AGP mkdir /hive/data/genomes/gorGor2/bed/allGaps cd /hive/data/genomes/gorGor2/bed/allGaps time findMotif -strand=+ -motif="gattacagattaca" -verbose=4 \ ../../gorGor2.2bit > findMotif.txt 2>&1 & # less than 1 minute grep "^#GAP" findMotif.txt | sed -e "s/#GAP //" \ | sort -k1,1 -k2,2n > allGaps.bed awk '{print $3-$2}' allGaps.bed | ave stdin # total 206104414.000000 featureBits -countGaps gorGor2 gap # 206081866 bases of 3035791622 (6.788%) in intersection # there appear to be more N's than defined in gaps, let's add them # to the gap table time featureBits gorGor2 -not gap -bed=notGap.bed # 0m28.556s time featureBits gorGor2 allGaps.bed notGap.bed -bed=new.gaps.bed # real 246m8.539s # what is the last index in the existing gap table: hgsql -N -e "select ix from gap;" gorGor2 | sort -n | tail -1 # 3998 # each scaffold has its parts renumbered from 1, thus, there # are many more gaps than 3998. All 3998 means is that is the # highest number of gaps on chr4_48641019_10247101 cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" gorGor2 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.gap hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad gorGor2 otherGap other.gap # Loaded 19438 # adding this many: wc -l bed.tab # 19438 # starting with this many hgsql -e "select count(*) from gap;" gorGor2 # 3998 hgsql gorGor2 -e 'load data local infile "bed.tab" into table gap;' # original number of gaps: grep fragment ../../gorGor2.agp | wc -l # 407798 # + 19438 # = 427236 # result count: hgsql -e "select count(*) from gap;" gorGor2 # 427236 # OK. ########################################################################## # MAKE 11.OOC FILES FOR BLAT (DONE - 2010-02-20 - Hiram) # numerator is gorGor2 gapless bases "real" as reported by faSize # denominator is hg17 gapless bases as reported by featureBits, # 1024 is threshold used for human -repMatch: calc \( 2829687208 / 2897310462 \) \* 1024 # ( 2829687208 / 2897310462 ) * 1024 = 1000.099830 # ==> use -repMatch=1000 according to size scaled down from 1024 for human. cd /hive/data/genomes/gorGor2 blat gorGor2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/gorGor2.11.ooc -repMatch=1000 # Wrote 29650 overused 11-mers to jkStuff/gorGor2.11.ooc mkdir /hive/data/staging/data/gorGor2 cp -p gorGor2.2bit chrom.sizes jkStuff/gorGor2.11.ooc \ /hive/data/staging/data/gorGor2 # check if there are any non-bridged gaps to contend with: time gapToLift -bedFile=jkStuff/nonBridgedGaps.bed gorGor2 \ jkStuff/gorGor2.nonBridged.lft # real 0m28.560s wc -l chrom.sizes # 58049 wc -l jkStuff/nonBridgedGaps.bed # 58049 # there are no non-bridged gaps, this bed file is the whole genome awk '{print $3}' jkStuff/nonBridgedGaps.bed | ave stdin # total 3035791622.000000 ave -col=2 chrom.sizes # total 3035791622.000000 ########################################################################## # GENBANK AUTO UPDATE (DONE - 2009-09-25 - Hiram) # add the following to genbank/etc/genbank.conf just before # the chimp panTro1 # Gorilla gorGor2.serverGenome = /hive/data/genomes/gorGor2/gorGor2.2bit gorGor2.clusterGenome = /scratch/data/gorGor2/gorGor2.2bit gorGor2.ooc = /hive/data/genomes/gorGor2/jkStuff/gorGor2.11.ooc gorGor2.lift = no gorGor2.perChromTables = no gorGor2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} gorGor2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} gorGor2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} gorGor2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} gorGor2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} gorGor2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} gorGor2.downloadDir = gorGor2 gorGor2.refseq.mrna.native.load = yes gorGor2.refseq.mrna.xeno.load = yes gorGor2.refseq.mrna.xeno.loadDesc = yes # Edit src/lib/gbGenome.c to add new species. With this line: # static char *gorGorNames[] = {"Gorilla gorilla", "Gorilla gorilla gorilla", "Gorilla gorilla uellensis", "Gorilla gorilla diehli", "Gorilla gorilla graueri", "Gorilla berengi", NULL}; cvs ci -m "adding names for gorilla species" src/lib/gbGenome.c make install-server ssh genbank screen # control this business with a screen since it takes a while cd /cluster/data/genbank time nice -n +19 bin/gbAlignStep -initial gorGor2 & # logFile: var/build/logs/2010.02.22-08:38:02.gorGor2.initalign.log # real 421m51.236s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad gorGor2 # logFile: var/dbload/hgwdev/logs/2010.02.22-18:08:15.dbload.log # real 15m40.078s # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank cvsup # add gorGor2 to: etc/align.dbs etc/hgwdev.dbs cvs ci -m "Added gorGor2 - Gorilla Gorilla" etc/align.dbs etc/hgwdev.dbs make etc-update # done - 2010-02-22 - Hiram ############################################################################ # BLATSERVERS ENTRY (DONE - 2009-08-06 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor2", "blat12", "17808", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("gorGor2", "blat12", "17809", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # Set default position to location of RHO as found by blat from hg18 sequence # (DONE - 2010-02-24 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr3_129946504_1170306:812299-819972" where name="gorGor2";' \ hgcentraltest #########################################################################