# for emacs: -*- mode: sh; -*- # DATE: 03-Apr-2011 # ORGANISM: Mustela putorius furo # TAXID: 9669 # ASSEMBLY LONG NAME: MusPutFur1.0 # ASSEMBLY SHORT NAME: MusPutFur1.0 # ASSEMBLY SUBMITTER: Ferret Genome Sequencing Consortium # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # Assembly Accession: GCA_000215625.1 # http://www.ncbi.nlm.nih.gov/genome/3295 # http://www.ncbi.nlm.nih.gov/assembly/286418/ # http://www.ncbi.nlm.nih.gov/bioproject/59869 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AEYP01 # Genome Coverage : 162x # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9669 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mustela_putorius/MusPutFur1.0/ ########################################################################## # Download sequence (DONE - 2013-01-10 - Pauline) # XXX NOTE: There were duplicate contigs in this assembly. To remove the # duplicates, the correspondingly Ensembl release was fetched: mkdir -p /hive/data/genomes/musFur1/ensembl cd /hive/data/genomes/musFur1/ensembl wget -r --cut-dirs=5 \ "ftp://ftp.ensembl.org/pub/release-70/fasta/mustela_putorius_furo/dna/" # run an faCount on the toplevel sequence: faCount Mustela_putorius_furo.MusPutFur1.0.70.dna.toplevel.fa.gz \ > toplevel.faCount.txt # generate a chrom.sizes file from that: egrep -v "^#seq|^total" toplevel.faCount.txt | cut -f1,2 | sort -k2nr \ | sed -e "s/\.1//" > ensembl.chrom.sizes # then from the previous attempt at the UCSC build of this sequence: twoBitInfo ../musFur1.unmasked.2bit stdout | sort -k2nr > musFur1.chrom.sizes # then, finding the dups by comparing those two chrom.sizes listings # to look for items with only one entry on the lists, which would be # the extra one in the duplicate contig assembly: egrepStr=`awk '{printf "|%s|%s", $1, $3}' musFur1.twoBitDup.txt | sed -e 's/^.//'` egrep -h "$egrepStr" ensembl.chrom.sizes musFur1.chrom.sizes | cut -f1 \ | sort | uniq -c | awk '$1 < 2' \ | awk '{print $2}' > remove.duplicate.list # now, to remove the duplicates: cd /hive/data/genomes/musFur1/genbank faSomeRecords -exclude ucsc.fa remove.duplicate.list ucsc.noDup.fa egrepStr=`cat remove.duplicate.list | xargs echo | sed -e 's/ /|/g'` egrep -v "$egrepStr" ucsc.agp > ucsc.noDup.agp # using these settings in the musFur1.config.ra file: fastaFiles /hive/data/genomes/musFur1/genbank/ucsc.noDup.fa agpFiles /hive/data/genomes/musFur1/genbank/ucsc.noDup.agp ######################################################################## # the following was the original setup for this build that included the # duplicates: mkdir /hive/data/genomes/musFur1 cd /hive/data/genomes/musFur1 mkdir genbank cd genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mustela_putorius/MusPutFur1.0/ ./ # real 3m8.737s # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 2410863155 bases (132851443 N's 2278011712 real 2278011712 upper 0 lower) in 7782 sequences in 1 files # Total size: mean 309799.9 sd 1997002.0 min 1000 (gi|334565269|gb|AEYP01117479.1|) max 52375790 (gi|334708367|gb|GL896898.1|) median 1448 zcat unplaced.scaf.agp.gz | awk '{print $1}' | sed -e 's/GL[0-9][0-9]*//; s/AEYP[0-9][0-9]*//' | sort | uniq # .1 zcat unplaced.scaf.agp.gz | sed -e 's/\.1//' > ../../../ucsc.agp zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*GL8/>GL8/; s/^>.*AEYP0/>AEYP0/; s/\.1. Mustela.*//" \ > ucsc.fa # strip the names down to something reasonable, they can all be: # >GL[0-9] # since they are all .1 versions: zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*GL/>GL/; s/.1. Petromyzon.*//" \ | gzip -c > ucsc.fa.gz zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | sed -e "s/^\(GL[0-9]*\).1/\1/;" | gzip -c > ucsc.agp.gz time checkAgpAndFa ucsc.agp.gz ucsc.fa.gz 2>&1 | tail -2 # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid # real 0m12.951s mkdir /hive/data/genomes/musFur1/photograph cd /hive/data/genomes/musFur1/photograph wget --timestamping \ http://www.nasa.gov/centers/kennedy/images/content/91159main_93pc780.jpg convert -geometry "220x300" 91159main_93pc780.jpg \ Alligator_Mississippiensis.jpg # check this .jpg file into the source tree kent/src/hg/htdocs/images/ git commit -m "gator photo from NASA Kennedy" Alligator_Mississippiensis.jpg # and copy to /usr/local/apache/htdocs/images cp -p Alligator_Mississippiensis.jpg /usr/local/apache/htdocs/images ########################################################################## # Fetch photograph (DONE - 2013-03-05 - Hiram) # using the photo indicated on the NCBI genome project page: mkdir /hive/data/genomes/musFur1/photo cd /hive/data/genomes/musFur1/photo wget --timestamping \ http://upload.wikimedia.org/wikipedia/commons/3/32/Ferret_2008.png convert -geometry "300x300" Ferret_2008.png Mustela_putorius_furo.png # checkin this file Mustela_putorius_furo.png to the source tree ########################################################################## # Initial makeGenomeDb.pl (DONE - 2013-01-10,03-05 - Pauline, Hiram) # obtain a template for this from the source tree: # kent/src/hg/utils/automation/configFiles/ # and check it back into the source tree when completed here: cd /hive/data/genomes/musFur1 cat << '_EOF_' > musFur1.config.ra # Config parameters for makeGenomeDb.pl: db musFur1 clade vertebrate genomeCladePriority 16 scientificName Mustela putorius furo commonName Ferret assemblyDate Apr. 2011 assemblyLabel Ferret Genome Sequencing Consortium assemblyShortLabel MusPutFur1.0 orderKey 2430 mitoAcc none fastaFiles /hive/data/genomes/musFur1/genbank/ucsc.noDup.fa agpFiles /hive/data/genomes/musFur1/genbank/ucsc.noDup.agp # qualFiles none dbDbSpeciesDir ferret photoCreditURL http://commons.wikimedia.org/wiki/File:Ferret_2008.png photoCreditName Alfredo Gutiérrez and Wikimedia Commons ncbiGenomeId 3295 ncbiAssemblyId 286418 ncbiAssemblyName MusPutFur1.0 ncbiBioProject 59869 genBankAccessionID GCA_000215625.1 taxId 9669 '_EOF_' # << happy emacs time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp musFur1.config.ra >>& agp.log # real 1m6.334s # verify OK: tail -1 agp.log # *** All done! (through the 'agp' step) # finish it off time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \ -dbHost=hgwdev musFur1.config.ra > db.log 2>&1 # real 17m58.217s # XXX failed in the trackDb/html setup due to missing image: # htdocs/images/Mustela_putorius_furo.{jpg|png|gif} # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/musFur1.unmasked.2bit /gbdb/musFur1/musFur1.2bit # browser should function now, add the files from the trackDb # hierarchy here to the source tree # after checking in the photograph and getting it into # /usr/local/apache/htdocs/images: time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=trackDb -forceDescription -dbHost=hgwdev musFur1.config.ra ########################################################################## # running repeat masker (DONE - 2013-03-05 - Hiram) mkdir /hive/data/genomes/musFur1/bed/repeatMasker cd /hive/data/genomes/musFur1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek musFur1 > do.log 2>&1 & # real 843m37.578s cat faSize.rmsk.txt # 2410758013 bases (132851443 N's 2277906570 real 1390898176 upper # 887008394 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %36.79 masked total, %38.94 masked real egrep -i "versi|relea" do.log # January 10 2013 (open-4-0-0) version of RepeatMasker # CC RELEASE 20120418; featureBits -countGaps musFur1 rmsk # 887930078 bases of 2410758013 (36.832%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2013-03-05 - Hiram) mkdir /hive/data/genomes/musFur1/bed/simpleRepeat cd /hive/data/genomes/musFur1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ musFur1 > do.log 2>&1 & # real 9m42.005s cat fb.simpleRepeat # 25005240 bases of 2277906570 (1.098%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/musFur1 twoBitMask musFur1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed musFur1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa musFur1.2bit stdout | faSize stdin > faSize.musFur1.2bit.txt cat faSize.musFur1.2bit.txt # 2410758013 bases (132851443 N's 2277906570 real 1389998131 upper # 887908439 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %36.83 masked total, %38.98 masked real rm /gbdb/musFur1/musFur1.2bit ln -s `pwd`/musFur1.2bit /gbdb/musFur1/musFur1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-03-05 - Hiram) mkdir /hive/data/genomes/musFur1/bed/gap cd /hive/data/genomes/musFur1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../musFur1.unmasked.2bit > findMotif.txt 2>&1 # real 0m12.841s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits musFur1 -not gap -bed=notGap.bed # 2277906570 bases of 2277906570 (100.000%) in intersection # real 0m13.015s # can see now if allGaps.bed actually is all the gaps: hgsql -N -e "select size from gap;" musFur1 | ave stdin | grep total # total 132851443.000000 ave -col=5 allGaps.bed | grep total # total 132851443.000000 # same count, no new gaps # check if any non-bridged gaps here: hgsql -N -e "select bridge from gap;" musFur1 | sort | uniq -c # 109700 yes ########################################################################## ## WINDOWMASKER (DONE - 2013-03-05 - Hiram) mkdir /hive/data/genomes/musFur1/bed/windowMasker cd /hive/data/genomes/musFur1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev musFur1 > do.log 2>&1 & # real 185m5.285s # Masking statistics cat faSize.musFur1.wmsk.txt # 2410758013 bases (132851443 N's 2277906570 real 1575810421 upper # 702096149 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %29.12 masked total, %30.82 masked real cat faSize.musFur1.wmsk.sdust.txt # 2410758013 bases (132851443 N's 2277906570 real 1563711507 upper # 714195063 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %29.63 masked total, %31.35 masked real cat faSize.musFur1.cleanWMSdust.txt # 2410758013 bases (132851443 N's 2277906570 real 1563711507 upper # 714195063 lower) in 7741 sequences in 1 files # Total size: mean 311427.2 sd 2002158.7 min 1000 (AEYP01117479) # max 52375790 (GL896898) median 1445 # %29.63 masked total, %31.35 masked real cat fb.musFur1.windowmaskerSdust.clean.txt # 714195063 bases of 2410758013 (29.625%) in intersection # how much does this window masker and repeat masker overlap: # can be done after rmsk is done. The script will often # fail on this command in the doLoad.csh if RM is not yet # complete and these are running at the same time: featureBits -countGaps musFur1 rmsk windowmaskerSdust # 453442864 bases of 2410758013 (18.809%) in intersection # if the script did fail on that command, finish it: time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -continue=cleanup -dbHost=hgwdev musFur1 > cleanup.log 2>&1 & # real 1m43.905s ########################################################################## # cpgIslands - (DONE - 2013-03-06 - Hiram) mkdir /hive/data/genomes/musFur1/bed/cpgIslands cd /hive/data/genomes/musFur1/bed/cpgIslands time doCpgIslands.pl musFur1 > do.log 2>&1 # real 38m33.141s cat fb.musFur1.cpgIslandExt.txt # 40222518 bases of 2277906570 (1.766%) in intersection ######################################################################### # genscan - (DONE - 2013-03-06 - Hiram) mkdir /hive/data/genomes/musFur1/bed/genscan cd /hive/data/genomes/musFur1/bed/genscan time doGenscan.pl musFur1 > do.log 2>&1 # real 33m41.740s # one job failed, changed the -window size to 2000000 to get # it to complete ./runLastOne.csh GL896906 000 gtf/000/GL896906.gtf \ pep/000/GL896906.pep subopt/000/GL896906.bed time doGenscan.pl -continue=makeBed musFur1 > bed.log 2>&1 # real 4m33.037s cat fb.musFur1.genscan.txt # 25257909 bases of 647368134 (3.902%) in intersection cat fb.musFur1.genscanSubopt.txt # 23439935 bases of 647368134 (3.621%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-03-06 - Hiram) # Use -repMatch=400, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2277906570 / 2897316137 \) \* 1024 # ( 2277906570 / 2897316137 ) * 1024 = 805.081744 # round up to 350 cd /hive/data/genomes/musFur1 time blat musFur1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/musFur1.11.ooc -repMatch=350 # Wrote 152295 overused 11-mers to jkStuff/musFur1.11.ooc # real 0m59.156s # there are no non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" musFur1 | sort | uniq -c # 109700 yes # find other make doc where gapToLift is used to make nonBridged.bed file ######################################################################### # AUTO UPDATE GENBANK (DONE - 2013-03-06 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mustela putorius 9 0 0 # Mustela putorius furo 122937 4210 0 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add musFur1 just before petMar1 # musFur1 (Ferret, Mustela putorius furo, taxId 9669) musFur1.serverGenome = /hive/data/genomes/musFur1/musFur1.2bit musFur1.clusterGenome = /hive/data/genomes/musFur1/musFur1.2bit musFur1.ooc = /hive/data/genomes/musFur1/jkStuff/musFur1.11.ooc musFur1.lift = no musFur1.perChromTables = no musFur1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} musFur1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} musFur1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} musFur1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} musFur1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} musFur1.refseq.mrna.native.load = yes musFur1.refseq.mrna.xeno.load = yes musFur1.genbank.mrna.xeno.load = no musFur1.genbank.est.native.load = yes musFur1.downloadDir = musFur1 # end of section added to etc/genbank.conf git commit -m "adding musFur1 ferret redmine 6352" etc/genbank.conf git push make etc-update git pull # Edit src/lib/gbGenome.c to add new species. # musFurNames[] = {"Mustela putorius furo", "Mustela putorius", NULL}; git commit -m "adding definition for musFurNames redmine 6352" \ src/lib/gbGenome.c git push make install-server ssh hgwdev # used to do this on "genbank" machine screen -S musFur1 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial musFur1 & # var/build/logs/2013.03.06-09:49:44.musFur1.initalign.log # real 325m17.355s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad musFur1 & # real 28m17.094s # var/dbload/hgwdev/logs/2013.03.06-15:15:56.dbload.log # real 31m56.575s # check the end of that dbload.log to see if it was successful # hgwdev 2013.03.06-15:47:47 dbload: finish # enable daily alignment and update of hgwdev (DONE - 2013-03-06 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add musFur1 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added musFur1/ferret to the daily build" \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # construct ucscToEnsembl chrom name translation (2013-03-06 - Hiram) mkdir /hive/data/genomes/musFur1/bed/ucscToEnsembl cd /hive/data/genomes/musFur1/bed/ucscToEnsembl # all the Ensembl names are the same as UCSC with the addition of .1 cat ../../chrom.sizes | cut -f1 | sed -e 's/^\(.*\)/\1\t\1.1/' \ | sort > ucscToEnsembl.tab # find length of ucsc names for key length in SQL below: awk '{print length($1)}' ucscToEnsembl.tab | sort -u # 12 # 8 cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(12)) ); '_EOF_' hgsql musFur1 < ucscToEnsembl.sql hgsql musFur1 \ -e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl' ############################################################################ # Ensembl Genes version 70 (DONE - 2013-03-06 - Hiram) cd /hive/data/genomes/musFur1 cat << '_EOF_' > musFur1.ensGene.ra # required db variable db musFur1 # optional nameTranslation, the sed command that will transform # Ensemble names to UCSC names. With quotes just to protect in perl: nameTranslation 's/^MT/chrM/; s/\.1//' '_EOF_' # << happy emacs doEnsGeneUpdate.pl -ensVersion=70 -stop=process \ musFur1.ensGene.ra > ensGene.process.log 2>&1 # log indicates OK: # genePredCheck -db=musFur1 musFur1.allGenes.gp.gz # checked: 23963 failed: 0 doEnsGeneUpdate.pl -ensVersion=70 -continue=load \ musFur1.ensGene.ra > ensGeneV70.load.log 2>&1 featureBits musFur1 ensGene # 52850166 bases of 2277906570 (2.320%) in intersection ######################################################################### # set default position as recommended from Jeramiah Smith # (DONE - 2012-10-23 - Hiram) hgsql -e \ 'update dbDb set defaultPos="GL476334:480870-830419" where name="musFur1";' \ hgcentraltest ############################################################################ # downloads and pushQ entry (DONE - 2012-03-06 - Hiram) # after adding musFur1 to the all.joiner file and verifying that # joinerCheck is clean, can construct the downloads: cd /hive/data/genomes/musFur1 time makeDownloads.pl -workhorse=hgwdev musFur1 XXX - running - Thu Mar 7 10:02:01 PST 2013 # real 21m55.107s mkdir /hive/data/genomes/musFur1/pushQ cd /hive/data/genomes/musFur1/pushQ # Mark says don't let the transMap track get there time makePushQSql.pl musFur1 2> stderr.txt > musFur1.sql # real 3m38.916s # will have to verify this one after loading on hgwbeta: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # ucscToEnsembl # the script should be fixed to place this in Ensembl Genes track # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/musFur1/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/musFur1/wib/quality.wib # WARNING: hgwdev does not have /gbdb/musFur1/bbi/quality.bw # WARNING: musFur1 does not have seq # WARNING: musFur1 does not have extFile scp -p musFur1.sql hgwbeta:/tmp/ ssh hgwbeta "hgsql qapushq < /tmp/musFur1.sql" ########################################################################## # BLATSERVERS ENTRY (DONE - 2012-10-23 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("musFur1", "blat4b", "17838", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("musFur1", "blat4b", "17839", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################