# for emacs: -*- mode: sh; -*- # Bos taurus (domestic sheep) -- ISGC Bos_taurus_1.0 (2010-02-01) # file template copied from susScr2.txt Bos taurus - genome overview An agriculturally important animal for beef and milk production. Kingdom: Animals; Group: Mammals Size: 3000 Mb; Haploid chromosomes: 120; Mitochondrion Genome sequencing status: draft assembly Project ID: 10708 Bos taurus genome overview (Project ID: 10708) # Bos taurus (NCBI Project ID: 10708, Accession: GCA_000005525.1) # by International Sheep Genomics Consortium (ISGC) # assembly] sequence: The Bovine Genome Consortia http://genomes.arc.georgetown.edu/drupal/bovine/?q=bovine_genome_consortium Assembled by the Baylor College of Medicine, Human Genome Sequencing Center http://www.bcm.edu/news/packages/bovinegenome.cfm DATE: 14-APR-2009 ORGANISM: Bos_taurus TAXID: 9913 ASSEMBLY LONG NAME: Btau_4.2 ASSEMBLY SHORT NAME: Btau_4.2 ASSEMBLY SUBMITTER: Bovine Genome Sequencing Consortium ASSEMBLY TYPE: Haploid NUMBER OF ASSEMBLY-UNITS: 1 Assembly Accession: GCA_000000095.5 Main NCBI site: BCM site: http://www.hgsc.bcm.tmc.edu/project-species-m-Bovine.hgsc?pageLocation=Bovine # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.2/ # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/ # Bos_taurus/Bos_taurus_1.0 ########################################################################## # Download sequence (DONE 2010-07-12 Chin) mkdir /hive/data/genomes/bosTau5 cd /hive/data/genomes/bosTau5 mkdir genbank cd genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Bos_taurus/Btau_4.2/*" # FINISHED --2010-07-13 13:50:10-- # Downloaded: 229 files, 1.9G in 11m 20s (2.82 MB/s) # Read ASSEMBLY_INFO # Looking up chrM in Entrez nucleotide search, http://www.ncbi.nlm.nih.gov/sites/nucleotide # search for Bos taurus mitochondrion complete genome http://www.ncbi.nlm.nih.gov/sites/nucleotide http://www.ncbi.nlm.nih.gov/nuccore/AF492351.1 (May, 2008) and http://www.ncbi.nlm.nih.gov/nuccore/60101824 (Mar, 2009, NC_006853.1 GI:60101824 ) use NC_006853.1 then. mkdir ucscChr # stay at genbank directory # fixup the accession names to become UCSC chrom names export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" zcat ${S}/AGP/chr${C}.comp.agp.gz \ | sed -e "s/^${ACC}/chr${C}/" | gzip > ucscChr/chr${C}.agp.gz done export S=Primary_Assembly/assembled_chromosomes cut -f2 ${S}/chr2acc | while read ACC do C=`grep "${ACC}" ${S}/chr2acc | cut -f1` echo "${ACC} -> chr${C}" echo ">chr${C}" > ucscChr/chr${C}.fa zcat ${S}/FASTA/chr${C}.fa.gz | grep -v "^>" >> ucscChr/chr${C}.fa gzip ucscChr/chr${C}.fa & done # Check them with faSize faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz # 2634541797 bases (166882320 N's 2467659477 real 2467659477 upper # 0 lower) in 30 sequences in 30 files faSize ucscChr/chr*.fa.gz # 2634541797 bases (166882320 N's 2467659477 real 2467659477 upper # 0 lower) in 30 sequences in 30 files # For unplaced scalfolds, named them as chrUn_xxxxxxxx # where xxxxxx is the original access id as: chrUn_GL340781.1 # The ".1" at the end need to be filter out since # MySQL does not allow "." as part of the table name and # will casue problems in genbank task step later export S=Primary_Assembly/unplaced_scaffolds zcat ${S}/AGP/unplaced.scaf.agp.gz | grep "^#" > ucscChr/chrUn.agp # append the gap records zcat ${S}/AGP/unplaced.scaf.agp.gz | grep -v "^#" \ | sed -e "s/^/chrUn_/" -e "s/\.1//" >> ucscChr/chrUn.agp gzip ucscChr/chrUn.agp & zcat ${S}/FASTA/unplaced.scaf.fa.gz \ | sed -e "s#^>.*|gb|#>chrUn_#; s#|.*##" -e "s/\.1//" \ | gzip > ucscChr/chrUn.fa.gz # about 11863 sequences in the unplaced zcat ucscChr/chrUn.fa.gz | grep "^>" | wc # 11863 11863 216228 # Check them with faSize faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 283532663 bases (18703099 N's 264829564 real 264829564 upper 0 lower) # in 11863 sequences in 1 files faSize ucscChr/chrUn.fa.gz # 283532663 bases (18703099 N's 264829564 real 264829564 upper 0 lower) # in 11863 sequences in 1 files ######################################################################### # Initial makeGenomeDb.pl (DONE - 2010-07-13 - Chin) cd /hive/data/genomes/bosTau5 cat << '_EOF_' > bosTau5.config.ra # Config parameters for makeGenomeDb.pl: db bosTau5 clade mammal genomeCladePriority 31 scientificName Bos taurus commonName Cow assemblyDate Apr, 2009 assemblyLabel BGSC (NCBI project 10708, accession GCA_000000095.5) assemblyShortLabel Baylor Btau_4.2 orderKey 236 mitoAcc 60101824 fastaFiles /hive/data/genomes/bosTau5/genbank/ucscChr/chr*.fa.gz agpFiles /hive/data/genomes/bosTau5/genbank/ucscChr/chr*.agp.gz #qualFiles none dbDbSpeciesDir cow taxId 9913 '_EOF_' # << happy emacs time makeGenomeDb.pl -noGoldGapSplit -workhorse=hgwdev bosTau5.config.ra \ > makeGenomeDb.log 2>&1 & # real 35m49.151s # real real 12m42.419s # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/bosTau5.unmasked.2bit /gbdb/bosTau5/bosTau5.2bit # Follow the instructions in makeGenomeDb.log: # # Search for '***' notes in each file in and make corrections # description.html # /cluster/data/bosTau5/html/{trackDb.ra,gap.html,gold.html} # Then copy these files to your ~/kent/src/hg/makeDb/trackDb/cow/bosTau5 # - cd ~/kent/src/hg/makeDb/trackDb # - edit makefile to add bosTau5 to DBS. # - git add cow/bosTau5/*.{ra,html} # - git commit -m "Added bosTau5 to DBS." makefile # - git commit -m "Initial descriptions for bosTau5." cow/bosTau5/*.{ra.html} # - git pull; git push # - Run make update DBS=bosTau5 and make alpha when done. # - (optional) Clean up /cluster/data/bosTau5/TemporaryTrackDbCheckout # browser should function now # and checkin the *.ra and *.html files. in # /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/sheep/bosTau5 ######################################################################### # RepeatMasker (DONE - 2010-07-14 - Chin) mkdir /hive/data/genomes/bosTau5/bed/repeatMasker cd /hive/data/genomes/bosTau5/bed/repeatMasker time nice -n +19 doRepeatMasker.pl -buildDir=`pwd` \ -workhorse=hgwdev -bigClusterHub=swarm -noSplit bosTau5 > do.log 2>&1 & # real 476m11.135s cat faSize.rmsk.txt # 2918090798 bases (185585419 N's 2732505379 real 1411788756 upper # 1320716623 lower) in 11894 sequences in 1 files # Total size: mean 245341.4 sd 4667229.1 min 999 (chrUn_AAFC03007068) # max 161108518 (chr1) median 7274 # N count: mean 15603.3 sd 298007.1 # U count: mean 118697.6 sd 2276112.1 # L count: mean 111040.6 sd 2100115.1 # %45.26 masked total, %48.33 masked real ######################################################################### # simpleRepeats ( DONE - 2010-07-14 - Chin) mkdir /hive/data/genomes/bosTau5/bed/simpleRepeat cd /hive/data/genomes/bosTau5/bed/simpleRepeat time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -bigClusterHub=swarm -smallClusterHub=encodek bosTau5 > do.log 2>&1 & # real 3m23.411s # step 'trf' failed because part 075 is for chrM, # ignore it, and continue from step filter: # Failed again with filter: output of previous step trf, # /hive/data/genomes/bosTau5/bed/simpleRepeat/run.cluster/run.time , # is required but does not appear to exist. # so cp it from bosTau5: time nice -n +19 doSimpleRepeat.pl -buildDir=`pwd` -workhorse=hgwdev \ -continue filter \ -bigClusterHub=swarm -smallClusterHub=encodek bosTau5 > do.log 2>&1 & # real 0m26.175s cat fb.simpleRepeat # 31611735 bases of 2732715400 (1.157%) in intersection # add to the repeatMasker cd /hive/data/genomes/bosTau5 twoBitMask bosTau5.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau5.2bit # safe to ignore warnings about >=13 fields twoBitToFa bosTau5.2bit stdout | faSize stdin > bosTau5.2bit.faSize.txt cat bosTau5.2bit.faSize.txt # 2918090798 bases (185585419 N's 2732505379 real 1411283094 upper # 1321222285 lower) in 11894 sequences in 1 files # %45.28 masked total, %48.35 masked real ######################################################################### # Marking *all* gaps - they are not all in the AGP file # (DONE - 2010-07-15 - Chin) mkdir /hive/data/genomes/bosTau5/bed/allGaps cd /hive/data/genomes/bosTau5/bed/allGaps time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../bosTau5.unmasked.2bit > findMotif.txt 2>&1 # real 0m53.448s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits bosTau5 -not gap -bed=notGap.bed # 2732715400 bases of 2732715400 (100.000%) in intersection featureBits bosTau5 allGaps.bed notGap.bed -bed=new.gaps.bed # 210021 bases of 2732715400 (0.008%) in intersection # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" bosTau5 | sort -n | tail -1 # 10355 # use tcsh and ctrl-c to create the here doc cat << '_EOF_' > mkGap.pl #!/usr/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" bosTau5 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits bosTau5 other.bed # 210021 bases of 2732715400 (0.008%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad bosTau5 otherGap other.bed # Reading other.bed # Loaded 12662 elements of size 8 # Sorted # Saving bed.tab # No load option selected, see file: bed.tab # Loaded 475536 elements of size 8 # adding this many: wc -l bed.tab # 12662 # starting with this many hgsql -e "select count(*) from gap;" bosTau5 # 75436 hgsql bosTau5 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" bosTau5 # 88098 = 75436 + 12662 ######################################################################## # Create kluster run files (DONE - 2010-07-18 - Chin) # numerator is bosTau5 gapless bases "real" as reported by: featureBits -noRandom -noHap bosTau5 gap # 166882320 bases of 2467675815 (6.763%) in intersection >>>># 1600136831 bases of 1184628269 (135.075%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2467675815 / 2861349177 \) \* 1024 # ( 2467675815 / 2861349177 ) * 1024 = 883.114880 # ==> use -repMatch=850 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/bosTau5 blat bosTau5.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/bosTau5.11.ooc \ -repMatch=850 & # Wrote 38895 overused 11-mers to jkStuff/bosTau5.11.ooc mkdir /hive/data/staging/data/bosTau5 cp -p bosTau5.2bit jkStuff/bosTau5.11.ooc /hive/data/staging/data/bosTau5 cp -p chrom.sizes /hive/data/staging/data/bosTau5 # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' bosTau5 \ | sort -k7,7nr # most gaps have size < 1000 # decide on a minimum gap for this break # similar to susScr2, use -minGap=5000 gapToLift -verbose=2 -minGap=5000 bosTau5 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed cp -p jkStuff/nonBridged.lft \ /hive/data/staging/data/bosTau5/bosTau5.nonBridged.lft # ask cluster-admin to copy (evry time if any file changed) # /hive/data/staging/data/bosTau5 directory to cluster nodes # /scratch/data/bosTau5 ######################################################################## # GENBANK AUTO UPDATE (pending - 2010-07-19 - Chin ) # N/A - since this is a partial browser for liftOver purpose. ############################################################################# # LIFTOVER TO bosTau4 (DONE - 2010-07-19 - Chin) sh hgwdev cd /hive/data/genomes/bosTau5 ln -s jkStuff/bosTau5.11.ooc 11.ooc time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ bosTau5 bosTau4 > doLiftOverToBosTau4.log 2>&1 & pushd /usr/local/apache/htdocs-hgdownload/goldenPath/bosTau5/liftOver/ md5sum bosTau5ToBosTau4.over.chain.gz >> md5sum.txt popd ###################################################################### # all.joiner update, downloads and in pushQ - (DONE 2010-07-19 Chin) cd $HOME/kent/src/hg/makeDb/schema # add bosTau5 after bosTau4 # fixup all.joiner until this is a clean output joinerCheck -database=bosTau5 -all all.joiner mkdir /hive/data/genomes/bosTau5/goldenPath cd /hive/data/genomes/bosTau5/goldenPath makeDownloads.pl bosTau5 > do.log 2>&1 # *** All done! # *** Please take a look at the downloads for bosTau5 using a web browser. # *** The downloads url is: # http://hgdownload-test.cse.ucsc.edu/goldenPath/bosTau5. # *** Edit each README.txt to resolve any notes marked with "***": # /hive/data/genomes/bosTau5/goldenPath/database/README.txt # /hive/data/genomes/bosTau5/goldenPath/bigZips/README.txt # (The htdocs/goldenPath/bosTau5/*/README.txt "files" are just links to # those.) # *** If you have to make any edits that would always apply to future # assemblies from the same sequencing center, please edit them into # ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for # help). # vi /hive/data/genomes/bosTau5/goldenPath/database/README.txt # vi /hive/data/genomes/bosTau5/goldenPath/bigZips/README.txt # now ready for pushQ entry mkdir /hive/data/genomes/bosTau5/pushQ cd /hive/data/genomes/bosTau5/pushQ makePushQSql.pl bosTau5 > bosTau5.pushQ.sql 2> stderr.out # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/bosTau5/wib/gc5Base.wib # WARNING: bosTau5 does not have seq # WARNING: bosTau5 does not have extFile # copy it to hgwbeta # use hgcat in .hg.conf.beta scp -p bosTau5.pushQ.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < bosTau5.pushQ.sql # in that pushQ entry walk through each entry and see if the # sizes will set properly # reset to hguser in .hg.conf.beta ########################################################################