"""Small benchmark on the effect of chunksizes and compression on HDF5 files. Francesc Alted 2007-11-25 """ import os, math, subprocess, tempfile from time import time import numpy import tables # Size of dataset #N, M = 512, 2**16 # 256 MB #N, M = 512, 2**18 # 1 GB #N, M = 512, 2**19 # 2 GB N, M = 2000, 1000000 # 15 GB #N, M = 4000, 1000000 # 30 GB datom = tables.Float64Atom() # elements are double precision def quantize(data, least_significant_digit): """quantize data to improve compression. data is quantized using around(scale*data)/scale, where scale is 2**bits, and bits is determined from the least_significant_digit. For example, if least_significant_digit=1, bits will be 4.""" precision = 10.**-least_significant_digit exp = math.log(precision, 10) if exp < 0: exp = int(math.floor(exp)) else: exp = int(math.ceil(exp)) bits = math.ceil(math.log(10.**-exp, 2)) scale = 2.**bits return numpy.around(scale*data)/scale def get_db_size(filename): sout = subprocess.Popen("ls -sh %s" % filename, shell=True, stdout=subprocess.PIPE).stdout line = [l for l in sout][0] return line.split()[0] def bench(chunkshape, filters): numpy.random.seed(1) # to have reproductible results filename = tempfile.mktemp(suffix='.h5') print "Doing test on the file system represented by:", filename f = tables.open_file(filename, 'w') e = f.create_earray(f.root, 'earray', datom, shape=(0, M), filters = filters, chunkshape = chunkshape) # Fill the array t1 = time() for i in xrange(N): #e.append([numpy.random.rand(M)]) # use this for less compressibility e.append([quantize(numpy.random.rand(M), 6)]) #os.system("sync") print "Creation time:", round(time()-t1, 3), filesize = get_db_size(filename) filesize_bytes = os.stat(filename)[6] print "\t\tFile size: %d -- (%s)" % (filesize_bytes, filesize) # Read in sequential mode: e = f.root.earray t1 = time() # Flush everything to disk and flush caches #os.system("sync; echo 1 > /proc/sys/vm/drop_caches") for row in e: t = row print "Sequential read time:", round(time()-t1, 3), #f.close() #return # Read in random mode: i_index = numpy.random.randint(0, N, 128) j_index = numpy.random.randint(0, M, 256) # Flush everything to disk and flush caches #os.system("sync; echo 1 > /proc/sys/vm/drop_caches") # Protection against too large chunksizes if 0 and filters.complevel and chunkshape[0]*chunkshape[1]*8 > 2**22: # 4 MB f.close() return t1 = time() for i in i_index: for j in j_index: t = e[i, j] print "\tRandom read time:", round(time()-t1, 3) f.close() # Benchmark with different chunksizes and filters #for complevel in (0, 1, 3, 6, 9): for complib in (None, 'zlib', 'lzo', 'blosc'): #for complib in ('blosc',): if complib: filters = tables.Filters(complevel=5, complib=complib) else: filters = tables.Filters(complevel=0) print "8<--"*20, "\nFilters:", filters, "\n"+"-"*80 #for ecs in (11, 14, 17, 20, 21, 22): for ecs in range(10, 24): #for ecs in (19,): chunksize = 2**ecs chunk1 = 1 chunk2 = chunksize/datom.itemsize if chunk2 > M: chunk1 = chunk2 / M chunk2 = M chunkshape = (chunk1, chunk2) cs_str = str(chunksize / 1024) + " KB" print "***** Chunksize:", cs_str, "/ Chunkshape:", chunkshape, "*****" bench(chunkshape, filters)