# for emacs: -*- mode: sh; -*- # Bos Taurus -- Baylor Release 4.2 (Apr 23 2009) ########################################################################## # Creating minimal build in order make liftover files between Btau4.2 and Btau4 # DOWNLOAD SEQUENCE (working - 2010-05-18 - Chin) # set up main genome directory ssh hgwdev cd /hive/data/genomes mkdir -p /hive/data/genomes/bosTau4P2/baylor/Contigs cd /hive/data/genomes/bosTau4P2/baylor/Contigs for F in README.Btau20080815.txt Contigs/* do wget --timestamping \ "ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20080815/${F}" done mv README.Btau20080815.txt ../ mkdir LinearizedChromosomes cd LinearizedChromosomes wget --timestamping \ "ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20080815/LinearizedChromosomes/*" # Looking up chrM in Entrez nucleotide search, # from bosTau3 note: NC_006853 GI:60101824 # search for Bos taurus mitochondrion complete genome # found it, and it had been update # Note, their Contigs files are from their 2006 release. # Their chrom files are new to this release. # fixup the Chr case names in the agp file: cd /hive/data/genomes/bosTau4P2/baylor sed -e "s/^Chr/chr/" Contigs/Btau4.2.agp > bosTau4P2.agp # fixup the contig names in the contigs.fa and qual files, utilize # the following perl scripts: # Change ">gi|112123023|gb|AAFC03055291.1| Bos taurus Ctg58.CH240-395G16, # whole genome shotgun sequence" line # to # ">AAFC03055291" cat << '_EOF_' > fixContigNames.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if (1 != $argc) { print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n", " | gzip > bosTau4P2.contigs.fa.gz\n"; exit 255; } my $fName = shift; open (FH,"zcat $fName|") or die "can not read $fName $!"; while (my $line = ) { if ($line =~ m/^>/) { my @a = split('\|', $line); $a[3] =~ s/\.[0-9]+//; print ">$a[3]\n"; } else { print "$line"; } } close (FH) '_EOF_' # << happy emacs cat << '_EOF_' > fixQuals.pl #!/usr/bin/env perl use warnings; use strict; my $argc = scalar(@ARGV); if (1 != $argc) { print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n", " | gzip > bosTau4P2.qual.gz\n"; exit 255; } my $fName = shift; open (FH,"zcat $fName|") or die "can not read $fName $!"; while (my $line = ) { if ($line =~ m/^>/) { $line =~ s/\.[0-9]+.*//; } print "$line"; } close (FH) '_EOF_' # << happy emacs # There are extra records in the qual and contigs fa files. # Only the sequences in the agp file should be used: grep "^chr" bosTau4P2.agp | egrep -v "clone|fragment" | cut -f6 \ | sort > agp.names # run those scripts, extract only the used records, and lift the # qual scores via the AGP file to a qac file. chmod +x fixContigNames.pl fixQuals.pl XXXX 05-19 ./fixContigNames.pl LinearizedChromosomes/Btau_4.2_chr.fa.gz(Btau20060815.contigs.fa.gz??) \ | faSomeRecords stdin agp.names stdout | gzip > bosTau4P2.contigs.fa.gz ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \ | faSomeRecords stdin agp.names stdout \ | qaToQac stdin stdout \ | qacAgpLift bosTau4P2.agp stdin bosTau4P2.qual.qac Read 0 qacs from stdin Got 11893 chroms in bosTau4P2.agp chr1 size=161108518 missing data: no quality scores for AAFC03067444.1 ############################################################################# # running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram) ssh kkstore06 cd /hive/data/genomes/bosTau4P2 cat << '_EOF_' > bosTau4P2.config.ra db bosTau4P2 clade mammal scientificName Bos Taurus assemblyDate Oct. 2007 assemblyLabel Baylor Release 4.0 orderKey 236 dbDbSpeciesDir cow mitoAcc NC_006853 agpFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.agp fastaFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.contigs.fa.gz qualFiles /hive/data/genomes/bosTau4P2/baylor/bosTau4P2.qual.qac commonName Cow '_EOF_' # << happy emacs makeGenomeDb.pl bosTau4P2.config.ra > makeGenomeDb.log 2>&1 ######################################################################### # REPEATMASKER (DONE - 2008-03-07 - Hiram) ssh kkstore06 screen # use a screen to manage this job mkdir /hive/data/genomes/bosTau4P2/bed/repeatMasker cd /hive/data/genomes/bosTau4P2/bed/repeatMasker # # This was going too slow on memk, on the order of 8 days. # chilled that memk batch, then switched it to finish on pk doRepeatMasker.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/repeatMasker \ -bigClusterHub=memk bosTau4P2 > do.log 2>&1 & # continuing doRepeatMasker.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/repeatMasker \ -continue=cat -bigClusterHub=pk bosTau4P2 > cat.log 2>&1 & time nice -n +19 featureBits bosTau4P2 rmsk > fb.bosTau4P2.rmsk.txt 2>&1 & # 1280927549 bases of 2731830700 (46.889%) in intersection # RepeatMasker and lib version from do.log: # RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $ # Jan 11 2008 (open-3-1-9) version of RepeatMasker # CC RELEASE 20071204; # Compare coverage to previous assembly: featureBits bosTau3 rmsk # 1200525422 bases of 2731807384 (43.946%) in intersection featureBits bosTau2 rmsk # 1291561904 bases of 2812203870 (45.927%) in intersection ######################################################################### # SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram) ssh kkstore06 screen # use a screen to manage this job mkdir /hive/data/genomes/bosTau4P2/bed/simpleRepeat cd /hive/data/genomes/bosTau4P2/bed/simpleRepeat # doSimpleRepeat.pl -buildDir=/hive/data/genomes/bosTau4P2/bed/simpleRepeat \ bosTau4P2 > do.log 2>&1 & # after RM run is done, add this mask: cd /hive/data/genomes/bosTau4P2 twoBitMask bosTau4P2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4P2.2bit twoBitToFa bosTau4P2.2bit stdout | faSize stdin # 2917974530 bases (186161294 N's 2731813236 real # 1450855409 upper 1280957827 lower) in 11900 sequences in 1 files # %43.90 masked total, %46.89 masked real twoBitToFa bosTau4P2.rmsk.2bit stdout | faSize stdin # 2917974530 bases (186161294 N's 2731813236 real # 1451369861 upper 1280443375 lower) in 11900 sequences in 1 files # %43.88 masked total, %46.87 masked real ######################################################################### # Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram) # use same repMatch value as bosTau2 ssh kkstore06 cd /hive/data/genomes/bosTau3 blat bosTau4P2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=bosTau4P2.11.ooc -repMatch=1005 ssh kkr1u00 mkdir /iscratch/i/bosTau4P2 cd /iscratch/i/bosTau4P2 cp -p /hive/data/genomes/bosTau4P2/bosTau4P2.2bit . for R in 2 3 4 5 6 7 8 do rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4P2/ done ######################################################################### ----------- 3Md stuff ------------------------------------------------------- # get sequence from UMD wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp' wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz' # fixup ids for agp # change ">Chr" # to ">chr" mv bos_taurus.agp bos_taurus.orig.agp cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp # back to the main directory cd /hive/data/genomes/bosTau4P2 # Run automation to make the basic genome cat << '_EOF_' > bosTau4P2.config.ra # Config parameters for makeGenomeDb.pl: db bosTau4P2 clade mammal scientificName Bos Taurus assemblyDate Aug. 2009 assemblyLabel Univ. Maryland Release 3.0 orderKey 236 dbDbSpeciesDir cow mitoAcc 60101824 commonName Cow taxId 9913 fastaFiles /hive/data/genomes/bosTau4P2/download/bos_taurus.fa.gz agpFiles /hive/data/genomes/bosTau4P2/download/bos_taurus.agp subsetLittleIds Y '_EOF_' # << happy emacs time makeGenomeDb.pl bosTau4P2.config.ra > & makeGenomeDb.pl.out & # took 11 minutes #note: I added subsetLittleIds option to config.ra # because I had checked the .agp which was ok, # but the script complained that there were some extra # sequences in the fasta file. This option allows # the system to ignore those, as long as everything # in column 6 of agp is found in the fasta file(s). featureBits -countGaps bosTau4P2 gap #20737263 bases of 2660922743 (0.779%) in intersection cat chrom.sizes | awk '{sum+=$2;print sum,$0}' #2660922743 # similar total # TODO # Organism Image wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \ 'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg' # TODO # Edit and check-in templates for description.html, gold.html, gap.html, bosTau4P2/trackDb.ra # repeat mask # for next time, put this under the bed/ dir mkdir repeatMasker cd repeatMasker time doRepeatMasker.pl bosTau4P2 -buildDir=`pwd` > & doRepeatMasker.pl.out & cat faSize.rmsk.txt #%47.84 masked total, %48.22 masked real featureBits -countGaps bosTau4P2 rmsk #1273525727 bases of 2660922743 (47.860%) in intersection # simple repeat masker trf cd /hive/data/genomes/bosTau4P2/bed mkdir simpleRepeat cd simpleRepeat time doSimpleRepeat.pl bosTau4P2 -buildDir=`pwd` > & doSimpleRepeat.pl.out & # failed only on chrM which doesn't matter # perhaps someone will look into why bigTrf fails on chrM ssh swarm cd /hive/data/genomes/bosTau4P2/bed/simpleRepeat/run.cluster /parasol/bin/para time > run.time time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTau4P2 \ >> & doSimpleRepeat.pl.out & featureBits -countGaps bosTau4P2 simpleRepeat #29450984 bases of 2660922743 (1.107%) in intersection # make final masked .2bit cd /hive/data/genomes/bosTau4P2 twoBitMask bosTau4P2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4P2.2bit #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it #might contain block coordinates, but this program uses only the first three #fields # (the entire span -- no support for blocks). # this seems to be generic output we always see. ############################################################################ # prepare cluster data (DONE - 2009-11-20 - Galt) ssh hgwdev cd /hive/data/genomes/bosTau4P2 # create gbdb symlink ln -s `pwd`/bosTau4P2.2bit /gbdb/bosTau4P2/ time blat bosTau4P2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024 #Wrote 27298 overused 11-mers to 11.ooc #151.299u 3.434s 3:06.07 83.1% 0+0k 0+0io 3pf+0w mkdir /hive/data/staging/data/bosTau4P2 cp -p bosTau4P2.2bit /hive/data/staging/data/bosTau4P2 cp -p 11.ooc /hive/data/staging/data/bosTau4P2 cp -p chrom.sizes /hive/data/staging/data/bosTau4P2 # ask admin to sync this directory: /hive/data/staging/data/bosTau4P2/ # to the kluster nodes /scratch/data/bosTau4P2/ ############################################################################# # LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt ) mkdir /hive/data/genomes/bosTau4P2/bed/blat.bosTau4.2009-11-23 cd /hive/data/genomes/bosTau4P2/bed/blat.bosTau4.2009-11-23 time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \ bosTau4P2 bosTau4 >& do.log # it actually ran out of space on /scratch/tmp (used over 16GB temporarily) # and I moved the temp dir somewhere else and # finished the net step. Then I did the load and cleanup steps. ################################################### #Change bosTau4P2 in hgcentraltest.dbDb to active=0: hgsql hgcentraltest update dbDb set active=0 where name='bosTau4P2'; # changed because liftover is confusing otherwise # when people are expecting successive versions # of the same assembly rather than alternate assemblies. update dbDb set description='Aug. 2009 (UMD3)' where name='bosTau4P2';