# for emacs: -*- mode: sh; -*-

# Build log ("make doc") for the chiLan1 genome browser database
# (Chinchilla lanigera, assembly ChiLan1.0).  Commands are listed in the
# order they were run; their recorded output and timings are kept as
# comments directly below each command.
# NOTE(review): this appears intended to be followed step by step rather
# than executed as one script -- it contains interactive steps
# (ssh, vi, screen) and lines that are file content for etc/genbank.conf.

# DATE: 22-May-2012
# ORGANISM: Chinchilla lanigera
# TAXID: 34839
# ASSEMBLY LONG NAME: ChiLan1.0
# ASSEMBLY SHORT NAME: ChiLan1.0
# ASSEMBLY SUBMITTER: Broad Institute
# ASSEMBLY TYPE: Haploid
# NUMBER OF ASSEMBLY-UNITS: 1
# ASSEMBLY ACCESSION: GCA_000276665.1
# FTP-RELEASE DATE: 09-Jul-2012

# http://www.ncbi.nlm.nih.gov/genome/10466
# http://www.ncbi.nlm.nih.gov/genome/assembly/397218/
# http://www.ncbi.nlm.nih.gov/bioproject/68239
# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AGCD01
# Genome Coverage : 87x Illumina

# http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=34839

# rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Chinchilla_lanigera/ChiLan1.0/

##########################################################################
# Download sequence (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1
    cd /hive/data/genomes/chiLan1
    mkdir genbank
    cd genbank
    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Chinchilla_lanigera/ChiLan1.0/ ./

    # verify the size of the sequence here:
    faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz
    # 2390852391 bases (106575991 N's 2284276400 real 2284276400 upper 0 lower)
    #    in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000
    #    (gi|393809579|gb|AGCD01081635.1|)
    #    max 63520090 (gi|394364413|gb|JH721862.1|) median 2911

    # strip the names down to something reasonable, they can all be:
    # >JH7[0-9] or >AGCD01[0-9]
    # (the unescaped '.' after "1" in the last expression also matches the
    #  '|' that follows the accession version in the names shown above)
    zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \
        | sed -e "s/^>.*JH7/>JH7/; s/^>.*AGCD01/>AGCD01/; s/.1. Chinchilla .*//" \
        | gzip -c > ucsc.fa.gz

    # same renaming for the AGP file: drop the ".1" version suffix
    zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
        | sed -e "s/^\(JH7[0-9]*\).1/\1/; s/^\(AGCD01[0-9]*\).1/\1/" \
        | gzip -c > ucsc.agp.gz

    checkAgpAndFa ucsc.agp.gz ucsc.fa.gz 2>&1 | tail -2
    # Valid Fasta file entry
    # All AGP and FASTA entries agree - both files are valid

    mkdir /hive/data/genomes/chiLan1/photograph
    cd /hive/data/genomes/chiLan1/photograph
    # from: http://commons.wikimedia.org/wiki/File:Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG
    # http://commons.wikimedia.org/wiki/User:Gu%C3%A9rin_Nicolas
    wget -O Chinchilla_lanigera_Guerin_Nicolas.jpg \
        http://upload.wikimedia.org/wikipedia/commons/1/18/Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG
    convert -geometry "350x350" Chinchilla_lanigera_Guerin_Nicolas.jpg \
        Chinchilla_lanigera.jpg
    # check this .jpg file into the source tree kent/src/hg/htdocs/images/
    git commit -m "from Wikimedia Commons - Guerin Nicolas" Chinchilla_lanigera.jpg
    # and copy to /usr/local/apache/htdocs/images
    # (the file to copy is the Chinchilla_lanigera.jpg made by 'convert' above)
    cp -p Chinchilla_lanigera.jpg /usr/local/apache/htdocs/images

##########################################################################
# Initial makeGenomeDb.pl (DONE - 2012-07-27 - Hiram)
    # obtain a template for this from the source tree:
    # kent/src/hg/utils/automation/configFiles/
    # and check it back into the source tree when completed here:
    cd /hive/data/genomes/chiLan1
    # The here-document below is closed by the bare word _EOF_: sh/bash
    # remove the quotes from the '_EOF_' delimiter (they only turn off
    # expansion inside the body), so a quoted terminator line never matches.
    cat << '_EOF_' > chiLan1.config.ra
# Config parameters for makeGenomeDb.pl:
db chiLan1
clade mammal
genomeCladePriority 35
scientificName Chinchilla lanigera
commonName Chinchilla
assemblyDate May 2012
assemblyLabel Broad Institute
assemblyShortLabel ChiLan1.0
orderKey 1700
mitoAcc none
fastaFiles /hive/data/genomes/chiLan1/genbank/ucsc.fa.gz
agpFiles /hive/data/genomes/chiLan1/genbank/ucsc.agp.gz
# qualFiles none
dbDbSpeciesDir chiLan
photoCreditURL http://commons.wikimedia.org/wiki/File:Chinchilla_lanigera_%28Wroclaw_zoo%29-2.JPG
photoCreditName Photo courtesy of Gu%C3%A9rin Nicolas, Wikimedia Commons
ncbiGenomeId 10466
ncbiAssemblyId 397218
ncbiAssemblyName ChiLan1.0
ncbiBioProject 68239
genBankAccessionID GCA_000276665.1
taxId 34839
_EOF_
    # << happy emacs

    # first pass stops after the 'agp' step so the result can be checked
    time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \
        -stop=agp chiLan1.config.ra > agp.log 2>&1
    # real 2m10.088s
    # verify OK:
    tail -1 agp.log
    # *** All done! (through the 'agp' step)

    # finish it off
    time makeGenomeDb.pl -continue=db -workhorse=hgwdev -fileServer=hgwdev \
        -dbHost=hgwdev chiLan1.config.ra > db.log 2>&1
    # real 9m24.133s

    # add the trackDb entries to the source tree, and the 2bit link:
    ln -s "$(pwd)/chiLan1.unmasked.2bit" /gbdb/chiLan1/chiLan1.2bit
    # browser should function now

##########################################################################
# running repeat masker (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/repeatMasker
    cd /hive/data/genomes/chiLan1/bed/repeatMasker
    time doRepeatMasker.pl -buildDir="$(pwd)" -noSplit \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        -smallClusterHub=encodek chiLan1 > do.log 2>&1 &
    # real 650m7.044s

    cat faSize.rmsk.txt
    # 2390852391 bases (106575991 N's 2284276400 real 1581183547 upper
    #    703092853 lower) in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #    max 63520090 (JH721862) median 2911
    # %29.41 masked total, %30.78 masked real

    egrep -i "versi|relea" do.log
    # April 26 2011 (open-3-3-0) version of RepeatMasker
    # CC RELEASE 20110920;
    # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $

    featureBits -countGaps chiLan1 rmsk
    # 703728919 bases of 2390852391 (29.434%) in intersection

    # why is it different than the faSize above ?
    # because rmsk masks out some N's as well as bases, the count above
    # separates out the N's from the bases, it doesn't show lower case N's

##########################################################################
# running simple repeat (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/simpleRepeat
    cd /hive/data/genomes/chiLan1/bed/simpleRepeat
    time doSimpleRepeat.pl -buildDir="$(pwd)" -bigClusterHub=swarm \
        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
        chiLan1 > do.log 2>&1 &
    # real 11m57.551s

    cat fb.simpleRepeat
    # 27063329 bases of 2284276400 (1.185%) in intersection

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
# (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/gap
    cd /hive/data/genomes/chiLan1/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
        -strand=+ ../../chiLan1.unmasked.2bit > findMotif.txt 2>&1
    # real 0m32.055s
    # keep only the "#GAP" lines of the verbose output, as bed
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits chiLan1 -not gap -bed=notGap.bed
    # 2284276400 bases of 2284276400 (100.000%) in intersection
    # real 0m13.196s

    # can see now if allGaps.bed actually is all the gaps:
    hgsql -N -e "select size from gap;" chiLan1 | ave stdin | grep total
    # total 106575991.000000
    ave -col=5 allGaps.bed | grep total
    # total 106575991.000000
    # equal counts indicates the rest is unnecessary, they are all marked
    time featureBits chiLan1 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 0 bases of 2284276400 (0.000%) in intersection
    # real 1m20.106s

    # no new gaps, nothing to do here

    # check if any non-bridged gaps here:
    hgsql -N -e "select bridge from gap;" chiLan1 | sort | uniq -c
    # 78817 yes

##########################################################################
## WINDOWMASKER (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/windowMasker
    cd /hive/data/genomes/chiLan1/bed/windowMasker
    time nice -n +19 doWindowMasker.pl -buildDir="$(pwd)" -workhorse=hgwdev \
        -dbHost=hgwdev chiLan1 > do.log 2>&1 &
    # real 172m49.925s

    # Masking statistics
    cat faSize.chiLan1.wmsk.txt
    # 2390852391 bases (106575991 N's 2284276400 real 1622493987 upper
    #    661782413 lower) in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #    max 63520090 (JH721862) median 2911
    # %27.68 masked total, %28.97 masked real

    cat faSize.chiLan1.wmsk.sdust.txt
    # 2390852391 bases (106575991 N's 2284276400 real 1611206785 upper
    #    673069615 lower) in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #    max 63520090 (JH721862) median 2911
    # %28.15 masked total, %29.47 masked real

    cat faSize.chiLan1.cleanWMSdust.txt
    # 2390852391 bases (106575991 N's 2284276400 real 1611206785 upper
    #    673069615 lower) in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #    max 63520090 (JH721862) median 2911
    # %28.15 masked total, %29.47 masked real

    cat fb.chiLan1.windowmaskerSdust.clean.txt
    # 673069615 bases of 2390852391 (28.152%) in intersection

    # how much does this window masker and repeat masker overlap:
    featureBits -countGaps chiLan1 rmsk windowmaskerSdust
    # 332330878 bases of 2390852391 (13.900%) in intersection

##########################################################################
# add simpleRepeats to RepeatMasker result (DONE - 2012-07-30 - Hiram)
    # add to rmsk after it is done:
    cd /hive/data/genomes/chiLan1
    twoBitMask -add chiLan1.rmsk.2bit \
        bed/simpleRepeat/trfMask.bed chiLan1.2bit
    # you can safely ignore the warning about fields >= 13

    twoBitToFa chiLan1.2bit stdout | faSize stdin > faSize.chiLan1.2bit.txt
    cat faSize.chiLan1.2bit.txt
    # 2390852391 bases (106575991 N's 2284276400 real 1580253341 upper
    #    704023059 lower) in 2838 sequences in 1 files
    # Total size: mean 842442.7 sd 4383001.2 min 1000 (AGCD01081635)
    #    max 63520090 (JH721862) median 2911
    # %29.45 masked total, %30.82 masked real

    # replace the link made earlier (to the unmasked 2bit) with the masked one
    rm /gbdb/chiLan1/chiLan1.2bit
    ln -s "$(pwd)/chiLan1.2bit" /gbdb/chiLan1/chiLan1.2bit

##########################################################################
# cpgIslands - (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/cpgIslands
    cd /hive/data/genomes/chiLan1/bed/cpgIslands
    time doCpgIslands.pl chiLan1 > do.log 2>&1
    # real 24m13.577s

    cat fb.chiLan1.cpgIslandExt.txt
    # 27037127 bases of 2284276400 (1.184%) in intersection

#########################################################################
# genscan - (DONE - 2012-07-27 - Hiram)
    mkdir /hive/data/genomes/chiLan1/bed/genscan
    cd /hive/data/genomes/chiLan1/bed/genscan
    time doGenscan.pl chiLan1 > do.log 2>&1
    # real 34m54.484s
    # two failed jobs
# Completed: 2836 of 2838 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 49352s 822.54m 13.71h 0.57d 0.002 y
# IO & Wait Time: 12838s 213.96m 3.57h 0.15d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest finished job: 1435s 23.92m 0.40h 0.02d
# Submission to last job: 1543s 25.72m 0.43h 0.02d

    # changing the window size to -window=2000000 and running:
    ./testGsBig.csh JH721897 000 gtf/000/JH721897.gtf pep/000/JH721897.pep \
        subopt/000/JH721897.bed
    # real 8m58.351s
    ./testGsBig.csh JH721865 000 gtf/000/JH721865.gtf pep/000/JH721865.pep subopt/000/JH721865.bed
    # real 13m56.143s

    # continuing
    time doGenscan.pl -continue=makeBed chiLan1 > bed.log 2>&1
    # real 3m24.229s

    cat fb.chiLan1.genscan.txt
    # 65590106 bases of 2284276400 (2.871%) in intersection
    cat fb.chiLan1.genscanSubopt.txt
    # 54325275 bases of 2284276400 (2.378%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-07-27 - Hiram)
    # Use -repMatch=850, based on size -- for human we use 1024
    # use the "real" number from the faSize measurement,
    # hg19 is 2897316137, calculate the ratio factor for 1024:
    # 2390852391 bases (106575991 N's 2284276400 real 1580253341 upper
    calc \( 2284276400 / 2897316137 \) \* 1024
    # ( 2284276400 / 2897316137 ) * 1024 = 807.333036

    # round up to 850 (orycun2 was also 920)

    cd /hive/data/genomes/chiLan1
    time blat chiLan1.2bit /dev/null /dev/null -tileSize=11 \
        -makeOoc=jkStuff/chiLan1.11.ooc -repMatch=850
    # Wrote 20262 overused 11-mers to jkStuff/chiLan1.11.ooc
    # real 0m54.278s
    # oryCun2 was: Wrote 32543 overused 11-mers to jkStuff/oryCun2.11.ooc

    # there are no non-bridged gaps, no lift file needed for genbank
    hgsql -N -e "select bridge from gap;" chiLan1 | sort | uniq -c
    # 78817 yes
    # cd /hive/data/genomes/chiLan1/jkStuff
    # gapToLift chiLan1 chiLan1.nonBridged.lift -bedFile=chiLan1.nonBridged.bed
    # largest non-bridged contig:
    # awk '{print $3-$2,$0}' chiLan1.nonBridged.bed | sort -nr | head
    # 123773608 chrX 95534 123869142 chrX.01

#########################################################################
# AUTO UPDATE GENBANK (DONE - 2012-07-27 - Hiram)
    # examine the file: /cluster/data/genbank/data/organism.lst
    # for your species to see what counts it has for:
    # organism mrnaCnt estCnt refSeqCnt
    # Chinchilla lanigera 45 1228 0
    # to decide which "native" mrna or ests you want to specify in genbank.conf

    ssh hgwdev
    cd "$HOME/kent/src/hg/makeDb/genbank"
    git pull
    # edit etc/genbank.conf to add chiLan1 just before oryLat2
    # (the lines from here to "end of section" are the text added to
    #  etc/genbank.conf, not shell commands)

# chiLan1 (Chinchilla lanigera)
chiLan1.serverGenome = /hive/data/genomes/chiLan1/chiLan1.2bit
chiLan1.clusterGenome = /hive/data/genomes/chiLan1/chiLan1.2bit
chiLan1.lift = no
chiLan1.perChromTables = no
chiLan1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
chiLan1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
chiLan1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
chiLan1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
chiLan1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
chiLan1.ooc = /hive/data/genomes/chiLan1/jkStuff/chiLan1.11.ooc
chiLan1.refseq.mrna.native.load = yes
chiLan1.refseq.mrna.xeno.load = yes
chiLan1.genbank.mrna.xeno.load = no
chiLan1.genbank.est.native.load = yes
chiLan1.downloadDir = chiLan1

    # end of section added to etc/genbank.conf
    git commit -m "adding chiLan1 Chinchilla" etc/genbank.conf
    git push
    make etc-update

    git pull
    # Edit src/lib/gbGenome.c to add new species.
    git commit -m "adding definition for chiLanNames" src/lib/gbGenome.c
    git push
    make install-server

    ssh hgwdev # used to do this on "genbank" machine
    screen -S chiLan1 # long running job managed in screen
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbAlignStep -initial chiLan1 &
    # var/build/logs/2012.07.30-09:39:12.chiLan1.initalign.log
    # real 345m41.104s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad chiLan1 &
    # real 20m50.723s
    # var/dbload/hgwdev/logs/2012.07.30-21:45:49.dbload.log

    # enable daily alignment and update of hgwdev (DONE - 2012-05-09 - Hiram)
    # NOTE(review): 2012-05-09 is earlier than the assembly date (22-May-2012)
    # and the 2012-07-27 download above -- this date looks carried over from
    # another make doc; confirm the actual completion date.
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add chiLan1 to:
    vi etc/align.dbs etc/hgwdev.dbs
    git commit -m "Added chiLan1." etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

#########################################################################