# for emacs: -*- mode: sh; -*- # Oreochromis niloticus - Nile Tilapia # ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AERX01 # # DATE: 31-Jan-2011 # ORGANISM: Oreochromis niloticus # TAXID: 8128 # ASSEMBLY LONG NAME: Orenil1.0 # ASSEMBLY SHORT NAME: Orenil1.0 # ASSEMBLY SUBMITTER: Broad Institute of MIT and Harvard # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # Assembly Accession: GCA_000188235.1 ############################################################################# # Download sequence from NCBI (DONE - 2011-04-20 - Hiram) mkdir -p /hive/data/genomes/oreNil1/genbank cd /hive/data/genomes/oreNil1/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Oreochromis_niloticus/Orenil1.0/*" faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 927725912 bases (111657865 N's 816068047 real 816068047 upper 0 lower) # in 5900 sequences in 1 files zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | grep "^#" > ucsc.agp # remove the .1 or .2 accession versions from the scaffold/contig names # our database tables can not function with periods in the scaffold names zcat Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | grep -v "^#" | sed -e "s/\.[12]//" >> ucsc.agp gzip ucsc.agp # extract the scaffold/contig name and remove the .1 or .2 accession version zcat Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz \ | sed -e "s/^>.*GL83/>GL83/; s/^>.*AERX01/>AERX01/; s/\.[12].*//" \ > ucsc.fa gzip ucsc.fa # verify nothing lost in the translation, should be same numbers as above faSize ucsc.fa.gz # 927725912 bases (111657865 N's 816068047 real 816068047 upper 0 lower) in 5900 sequences in 1 files # verify the names are OK zcat ucsc.fa.gz | grep "^>" | sort | head zcat ucsc.fa.gz | grep "^>" | sort | tail ############################################################################# # Initial database build (DONE - 2011-09-08 - Hiram) cd /hive/data/genomes/oreNil1 cat << '_EOF_' > oreNil1.config.ra # Config parameters for makeGenomeDb.pl: db oreNil1 clade vertebrate genomeCladePriority 85 scientificName Oreochromis niloticus commonName Nile tilapia assemblyDate Jan. 2011 assemblyLabel Broad Institute of MIT and Harvard Orenil1.0 (GCA_000188235.1) assemblyShortLabel Nile tilapia orderKey 468 mitoAcc NC_013663 fastaFiles /cluster/data/oreNil1/genbank/ucsc.fa.gz agpFiles /cluster/data/oreNil1/genbank/ucsc.agp dbDbSpeciesDir oreNil taxId 8128 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -stop=agp oreNil1.config.ra > agp.log 2>&1 # real 1m26.563s time makeGenomeDb.pl -continue=db oreNil1.config.ra > db.log 2>&1 # about 9 minutes ########################################################################## # running repeat masker (DONE - 2011-11-30 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/repeatMasker cd /hive/data/genomes/oreNil1/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk oreNil1 > do.log 2>&1 & # real 73m53.508s XXX - running - Wed Nov 30 09:27:18 PST 2011 # real 35m31.331s cat faSize.rmsk.txt # 927742539 bases (111657865 N's 816084674 real 786293778 upper # 29790896 lower) in 5901 sequences in 1 files # %3.21 masked total, %3.65 masked real egrep -i "versi|relea" do.log # April 26 2011 (open-3-3-0) version of RepeatMasker # CC RELEASE 20110920; # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ featureBits -countGaps oreNil1 rmsk # 30136368 bases of 927742539 (3.248%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's # rmsk is masking N's sometimes: featureBits oreNil1 -not gap -bed=notGap.bed # 816084674 bases of 816084674 (100.000%) in intersection featureBits oreNil1 rmsk notGap.bed -bed=stdout \ | gzip -c > cleanRmsk.bed.gz # 29790896 bases of 816084674 (3.650%) in intersection # rmsk bases: 30136368 minus non-N bases 29790896 == N's masked: 345472 ########################################################################## # running simple repeat (DONE - 2011-11-30 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/simpleRepeat cd /hive/data/genomes/oreNil1/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ oreNil1 > do.log 2>&1 & # real 11m55.138s cat fb.simpleRepeat # 7954018 bases of 816084674 (0.975%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/oreNil1 twoBitMask oreNil1.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed oreNil1.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa oreNil1.2bit stdout | faSize stdin > faSize.oreNil1.2bit.txt cat faSize.oreNil1.2bit.txt # 1511735326 bases (153400444 N's 1358334882 real 1024824487 upper # 333510395 lower) in 19550 sequences in 1 files # %22.06 masked total, %24.55 masked real rm /gbdb/oreNil1/oreNil1.2bit ln -s `pwd`/oreNil1.2bit /gbdb/oreNil1/oreNil1.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2011-11-30 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/gap cd /hive/data/genomes/oreNil1/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../oreNil1.unmasked.2bit > findMotif.txt 2>&1 # real 0m9.711s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed featureBits oreNil1 -not gap -bed=notGap.bed # 816084674 bases of 816084674 (100.000%) in intersection featureBits oreNil1 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 816084674 (0.000%) in intersection # No new gaps to mark. To see the complete sequence here if there are # some, look in xenTro3.txt ########################################################################## ## WINDOWMASKER (DONE - 2011-11-30 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/windowMasker cd /hive/data/genomes/oreNil1/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev oreNil1 > do.log 2>&1 & # real 55m3.404s # Masking statistics twoBitToFa oreNil1.wmsk.2bit stdout | faSize stdin # 927742539 bases (111657865 N's 816084674 real 612807423 upper # 203277251 lower) in 5901 sequences in 1 files # %21.91 masked total, %24.91 masked real twoBitToFa oreNil1.wmsk.sdust.2bit stdout | faSize stdin # 927742539 bases (111657865 N's 816084674 real 608110777 upper # 207973897 lower) in 5901 sequences in 1 files # %22.42 masked total, %25.48 masked real hgLoadBed oreNil1 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 5156979 elements of size 3 featureBits -countGaps oreNil1 windowmaskerSdust # 319631762 bases of 927742539 (34.453%) in intersection # eliminate the gaps from the masking featureBits oreNil1 -not gap -bed=notGap.bed # 816084674 bases of 816084674 (100.000%) in intersection time nice -n +19 featureBits oreNil1 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 207973897 bases of 816084674 (25.484%) in intersection # real 1m44.710s # reload track to get it clean hgLoadBed oreNil1 windowmaskerSdust cleanWMask.bed.gz # Loaded 5181457 elements of size 4 featureBits -countGaps oreNil1 windowmaskerSdust # 207973897 bases of 927742539 (22.417%) in intersection zcat cleanWMask.bed.gz \ | twoBitMask ../../oreNil1.unmasked.2bit stdin \ -type=.bed oreNil1.cleanWMSdust.2bit twoBitToFa oreNil1.cleanWMSdust.2bit stdout | faSize stdin \ > oreNil1.cleanWMSdust.faSize.txt cat oreNil1.cleanWMSdust.faSize.txt # 927742539 bases (111657865 N's 816084674 real 608110777 upper # 207973897 lower) in 5901 sequences in 1 files # %22.42 masked total, %25.48 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps oreNil1 rmsk windowmaskerSdust # 21441295 bases of 927742539 (2.311%) in intersection ######################################################################### # MASK SEQUENCE WITH WM+TRF (DONE - 2011-11-30 - Hiram) # since rmsk masks so very little of the genome, use the clean WM mask # on the genome sequence cd /hive/data/genomes/oreNil1 twoBitMask -add bed/windowMasker/oreNil1.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed oreNil1.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa oreNil1.2bit stdout | faSize stdin > faSize.oreNil1.txt cat faSize.oreNil1.txt # 927742539 bases (111657865 N's 816084674 real 607935881 upper # 208148793 lower) in 5901 sequences in 1 files # %22.44 masked total, %25.51 masked real # create symlink to gbdb ssh hgwdev rm /gbdb/oreNil1/oreNil1.2bit ln -s `pwd`/oreNil1.2bit /gbdb/oreNil1/oreNil1.2bit ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2011-11-30 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 816084674 / 2897316137 \) \* 1024 # ( 816084674 / 2897316137 ) * 1024 = 288.429245 # round up to 300 cd /hive/data/genomes/oreNil1 blat oreNil1.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/oreNil1.11.ooc -repMatch=300 # Wrote 16716 overused 11-mers to jkStuff/oreNil1.11.ooc # copy all of this stuff to the klusters: # there are no non-bridged gaps hgsql -N -e "select bridge from gap;" oreNil1 | sort | uniq -c # 71854 yes # If there were: # cd /hive/data/genomes/oreNil1/jkStuff # gapToLift oreNil1 nonBridged.lift -bedFile=nonBridged.bed cd /hive/data/genomes/oreNil1 mkdir /hive/data/staging/data/oreNil1 cp -p jkStuff/oreNil1.11.ooc chrom.sizes \ oreNil1.2bit /hive/data/staging/data/oreNil1 # request rsync copy from cluster admin ######################################################################### # BLATSERVERS ENTRY (DONE - 2011-12-02 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("oreNil1", "blat1", "17816", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("oreNil1", "blat1", "17817", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # set default position on the RHOA gene sequence (via blat) # (DONE - 2011-12-02 - Hiram) hgsql -e \ 'update dbDb set defaultPos="GL831146:690296-694012" where name="oreNil1";' \ hgcentraltest ############################################################################ # cpgIslands - (DONE - 2011-04-20 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/cpgIslands cd /hive/data/genomes/oreNil1/bed/cpgIslands time doCpgIslands.pl oreNil1 > do.log 2>&1 # Elapsed time: 11m10s time featureBits oreNil1 cpgIslandExt > fb.oreNil1.cpgIslandExt.txt 2>&1 # 6076152 bases of 816084674 (0.745%) in intersection # real 0m2.342s ######################################################################### # genscan - (DONE - 2011-04-26 - Hiram) mkdir /hive/data/genomes/oreNil1/bed/genscan cd /hive/data/genomes/oreNil1/bed/genscan time doGenscan.pl oreNil1 > do.log 2>&1 # real 15m43.924s cat fb.oreNil1.genscan.txt # 38174535 bases of 816084674 (4.678%) in intersection cat fb.oreNil1.genscanSubopt.txt # 30455436 bases of 816084674 (3.732%) in intersection #########################################################################