#!/usr/bin/env python import sys import time from tables import * import random import math import warnings import numpy # Initialize the random generator always with the same integer # in order to have reproductible results random.seed(19) numpy.random.seed(19) randomvalues = 0 worst=0 Small = { "var1": StringCol(itemsize=4, dflt="Hi!", pos=2), "var2": Int32Col(pos=1), "var3": Float64Col(pos=0), #"var4" : BoolCol(), } def createNewBenchFile(bfile, verbose): class Create(IsDescription): nrows = Int32Col(pos=0) irows = Int32Col(pos=1) tfill = Float64Col(pos=2) tidx = Float64Col(pos=3) tcfill = Float64Col(pos=4) tcidx = Float64Col(pos=5) rowsecf = Float64Col(pos=6) rowseci = Float64Col(pos=7) fsize = Float64Col(pos=8) isize = Float64Col(pos=9) psyco = BoolCol(pos=10) class Search(IsDescription): nrows = Int32Col(pos=0) rowsel = Int32Col(pos=1) time1 = Float64Col(pos=2) time2 = Float64Col(pos=3) tcpu1 = Float64Col(pos=4) tcpu2 = Float64Col(pos=5) rowsec1 = Float64Col(pos=6) rowsec2 = Float64Col(pos=7) psyco = BoolCol(pos=8) if verbose: print "Creating a new benchfile:", bfile # Open the benchmarking file bf = open_file(bfile, "w") # Create groups for recsize in ["small"]: group = bf.create_group("/", recsize, recsize+" Group") # Attach the row size of table as attribute if recsize == "small": group._v_attrs.rowsize = 16 # Create a Table for writing bench bf.create_table(group, "create_best", Create, "best case") bf.create_table(group, "create_worst", Create, "worst case") for case in ["best", "worst"]: # create a group for searching bench (best case) groupS = bf.create_group(group, "search_"+case, "Search Group") # Create Tables for searching for mode in ["indexed", "inkernel", "standard"]: groupM = bf.create_group(groupS, mode, mode+" Group") # for searching bench #for atom in ["string", "int", "float", "bool"]: for atom in ["string", "int", "float"]: bf.create_table(groupM, atom, Search, atom+" bench") bf.close() def createFile(filename, nrows, filters, index, heavy, noise, verbose): # Open a file in "w"rite mode fileh = open_file(filename, mode = "w", title="Searchsorted Benchmark", filters=filters) rowswritten = 0 # Create the test table table = fileh.create_table(fileh.root, 'table', Small, "test table", None, nrows) t1 = time.time() cpu1 = time.clock() nrowsbuf = table.nrowsinbuf minimum = 0 maximum = nrows for i in xrange(0, nrows, nrowsbuf): if i+nrowsbuf > nrows: j = nrows else: j = i+nrowsbuf if randomvalues: var3 = numpy.random.uniform(minimum, maximum, size=j-i) else: var3 = numpy.arange(i, j, dtype=numpy.float64) if noise > 0: var3 += numpy.random.uniform(-noise, noise, size=j-i) var2 = numpy.array(var3, dtype=numpy.int32) var1 = numpy.empty(shape=[j-i], dtype="S4") if not heavy: var1[:] = var2 table.append([var3, var2, var1]) table.flush() rowswritten += nrows time1 = time.time()-t1 tcpu1 = time.clock()-cpu1 print "Time for filling:", round(time1, 3),\ "Krows/s:", round(nrows/1000./time1, 3), fileh.close() size1 = os.stat(filename)[6] print ", File size:", round(size1/(1024.*1024.), 3), "MB" fileh = open_file(filename, mode = "a", title="Searchsorted Benchmark", filters=filters) table = fileh.root.table rowsize = table.rowsize if index: t1 = time.time() cpu1 = time.clock() # Index all entries if not heavy: indexrows = table.cols.var1.create_index(filters=filters) for colname in ['var2', 'var3']: table.colinstances[colname].create_index(filters=filters) time2 = time.time()-t1 tcpu2 = time.clock()-cpu1 print "Time for indexing:", round(time2, 3), \ "iKrows/s:", round(indexrows/1000./time2, 3), else: indexrows = 0 time2 = 0.0000000001 # an ugly hack tcpu2 = 0. if verbose: if index: idx = table.cols.var1.index print "Index parameters:", repr(idx) else: print "NOT indexing rows" # Close the file fileh.close() size2 = os.stat(filename)[6] - size1 if index: print ", Index size:", round(size2/(1024.*1024.), 3), "MB" return (rowswritten, indexrows, rowsize, time1, time2, tcpu1, tcpu2, size1, size2) def benchCreate(file, nrows, filters, index, bfile, heavy, psyco, noise, verbose): # Open the benchfile in append mode bf = open_file(bfile, "a") recsize = "small" if worst: table = bf.get_node("/"+recsize+"/create_worst") else: table = bf.get_node("/"+recsize+"/create_best") (rowsw, irows, rowsz, time1, time2, tcpu1, tcpu2, size1, size2) = \ createFile(file, nrows, filters, index, heavy, noise, verbose) # Collect data table.row["nrows"] = rowsw table.row["irows"] = irows table.row["tfill"] = time1 table.row["tidx"] = time2 table.row["tcfill"] = tcpu1 table.row["tcidx"] = tcpu2 table.row["fsize"] = size1 table.row["isize"] = size2 table.row["psyco"] = psyco tapprows = round(time1, 3) cpuapprows = round(tcpu1, 3) tpercent = int(round(cpuapprows/tapprows, 2)*100) print "Rows written:", rowsw, " Row size:", rowsz print "Time writing rows: %s s (real) %s s (cpu) %s%%" % \ (tapprows, cpuapprows, tpercent) rowsecf = rowsw / tapprows table.row["rowsecf"] = rowsecf #print "Write rows/sec: ", rowsecf print "Total file size:", round((size1+size2)/(1024.*1024.), 3), "MB", print ", Write KB/s (pure data):", int(rowsw * rowsz / (tapprows * 1024)) #print "Write KB/s :", int((size1+size2) / ((time1+time2) * 1024)) tidxrows = time2 cpuidxrows = round(tcpu2, 3) tpercent = int(round(cpuidxrows/tidxrows, 2)*100) print "Rows indexed:", irows, " (IMRows):", irows / float(10**6) print "Time indexing rows: %s s (real) %s s (cpu) %s%%" % \ (round(tidxrows, 3), cpuidxrows, tpercent) rowseci = irows / tidxrows table.row["rowseci"] = rowseci table.row.append() bf.close() def readFile(filename, atom, riter, indexmode, dselect, verbose): # Open the HDF5 file in read-only mode fileh = open_file(filename, mode = "r") table = fileh.root.table var1 = table.cols.var1 var2 = table.cols.var2 var3 = table.cols.var3 if indexmode == "indexed": if var2.index.nelements > 0: where = table._whereIndexed else: warnings.warn("Not indexed table or empty index. Defaulting to in-kernel selection") indexmode = "inkernel" where = table._whereInRange elif indexmode == "inkernel": where = table.where if verbose: print "Max rows in buf:", table.nrowsinbuf print "Rows in", table._v_pathname, ":", table.nrows print "Buffersize:", table.rowsize * table.nrowsinbuf print "MaxTuples:", table.nrowsinbuf if indexmode == "indexed": print "Chunk size:", var2.index.sorted.chunksize print "Number of elements per slice:", var2.index.nelemslice print "Slice number in", table._v_pathname, ":", var2.index.nrows #table.nrowsinbuf = 10 #print "nrowsinbuf-->", table.nrowsinbuf rowselected = 0 time2 = 0. tcpu2 = 0. results = [] print "Select mode:", indexmode, ". Selecting for type:", atom # Initialize the random generator always with the same integer # in order to have reproductible results on each read iteration random.seed(19) numpy.random.seed(19) for i in xrange(riter): # The interval for look values at. This is aproximately equivalent to # the number of elements to select rnd = numpy.random.randint(table.nrows) cpu1 = time.clock() t1 = time.time() if atom == "string": val = str(rnd)[-4:] if indexmode in ["indexed", "inkernel"]: results = [p.nrow for p in where('var1 == val')] else: results = [p.nrow for p in table if p["var1"] == val] elif atom == "int": val = rnd+dselect if indexmode in ["indexed", "inkernel"]: results = [p.nrow for p in where('(rnd <= var3) & (var3 < val)')] else: results = [p.nrow for p in table if rnd <= p["var2"] < val] elif atom == "float": val = rnd+dselect if indexmode in ["indexed", "inkernel"]: t1=time.time() results = [p.nrow for p in where('(rnd <= var3) & (var3 < val)')] else: results = [p.nrow for p in table if float(rnd) <= p["var3"] < float(val)] else: raise ValueError("Value for atom '%s' not supported." % atom) rowselected += len(results) #print "selected values-->", results if i == 0: # First iteration time1 = time.time() - t1 tcpu1 = time.clock() - cpu1 else: if indexmode == "indexed": # if indexed, wait until the 5th iteration (in order to # insure that the index is effectively cached) to take times if i >= 5: time2 += time.time() - t1 tcpu2 += time.clock() - cpu1 else: time2 += time.time() - t1 tcpu2 += time.clock() - cpu1 if riter > 1: if indexmode == "indexed" and riter >= 5: correction = 5 else: correction = 1 time2 = time2 / (riter - correction) tcpu2 = tcpu2 / (riter - correction) if verbose and 1: print "Values that fullfill the conditions:" print results #rowsread = table.nrows * riter rowsread = table.nrows rowsize = table.rowsize # Close the file fileh.close() return (rowsread, rowselected, rowsize, time1, time2, tcpu1, tcpu2) def benchSearch(file, riter, indexmode, bfile, heavy, psyco, dselect, verbose): # Open the benchfile in append mode bf = open_file(bfile, "a") recsize = "small" if worst: tableparent = "/"+recsize+"/search_worst/"+indexmode+"/" else: tableparent = "/"+recsize+"/search_best/"+indexmode+"/" # Do the benchmarks if not heavy: #atomlist = ["string", "int", "float", "bool"] atomlist = ["string", "int", "float"] else: #atomlist = ["int", "float", "bool"] atomlist = ["int", "float"] for atom in atomlist: tablepath = tableparent + atom table = bf.get_node(tablepath) (rowsr, rowsel, rowssz, time1, time2, tcpu1, tcpu2) = \ readFile(file, atom, riter, indexmode, dselect, verbose) row = table.row row["nrows"] = rowsr row["rowsel"] = rowsel treadrows = round(time1, 6) row["time1"] = time1 treadrows2 = round(time2, 6) row["time2"] = time2 cpureadrows = round(tcpu1, 6) row["tcpu1"] = tcpu1 cpureadrows2 = round(tcpu2, 6) row["tcpu2"] = tcpu2 row["psyco"] = psyco tpercent = int(round(cpureadrows/treadrows, 2)*100) if riter > 1: tpercent2 = int(round(cpureadrows2/treadrows2, 2)*100) else: tpercent2 = 0. tMrows = rowsr / (1000*1000.) sKrows = rowsel / 1000. if atom == "string": # just to print once print "Rows read:", rowsr, "Mread:", round(tMrows, 6), "Mrows" print "Rows selected:", rowsel, "Ksel:", round(sKrows, 6), "Krows" print "Time selecting (1st time): %s s (real) %s s (cpu) %s%%" % \ (treadrows, cpureadrows, tpercent) if riter > 1: print "Time selecting (cached): %s s (real) %s s (cpu) %s%%" % \ (treadrows2, cpureadrows2, tpercent2) #rowsec1 = round(rowsr / float(treadrows), 6)/10**6 rowsec1 = rowsr / treadrows row["rowsec1"] = rowsec1 print "Read Mrows/sec: ", print round(rowsec1 / 10.**6, 6), "(first time)", if riter > 1: rowsec2 = rowsr / treadrows2 row["rowsec2"] = rowsec2 print round(rowsec2 / 10.**6, 6), "(cache time)" else: print # Append the info to the table row.append() table.flush() # Close the benchmark file bf.close() if __name__=="__main__": import sys import os.path import getopt try: import psyco psyco_imported = 1 except: psyco_imported = 0 import time usage = """usage: %s [-v] [-p] [-R] [-r] [-w] [-c level] [-l complib] [-S] [-F] [-n nrows] [-x] [-b file] [-t] [-h] [-k riter] [-m indexmode] [-N range] [-d range] datafile -v verbose -p use "psyco" if available -R use Random values for filling -r only read test -w only write test -c sets a compression level (do not set it or 0 for no compression) -l sets the compression library ("zlib", "lzo", "ucl", "bzip2" or "none") -S activate shuffling filter -F activate fletcher32 filter -n set the number of rows in tables (in krows) -x don't make indexes -b bench filename -t worsT searching case -h heavy benchmark (operations without strings) -m index mode for reading ("indexed" | "inkernel" | "standard") -N introduce (uniform) noise within range into the values -d the interval for look values (int, float) at. Default is 3. -k number of iterations for reading\n""" % sys.argv[0] try: opts, pargs = getopt.getopt(sys.argv[1:], 'vpSFRrowxthk:b:c:l:n:m:N:d:') except: sys.stderr.write(usage) sys.exit(0) # if we pass too much parameters, abort if len(pargs) != 1: sys.stderr.write(usage) sys.exit(0) # default options dselect = 3. noise = 0. verbose = 0 fieldName = None testread = 1 testwrite = 1 usepsyco = 0 complevel = 0 shuffle = 0 fletcher32 = 0 complib = "zlib" nrows = 1000 index = 1 heavy = 0 bfile = "bench.h5" supported_imodes = ["indexed", "inkernel", "standard"] indexmode = "inkernel" riter = 1 # Get the options for option in opts: if option[0] == '-v': verbose = 1 if option[0] == '-p': usepsyco = 1 if option[0] == '-R': randomvalues = 1 if option[0] == '-S': shuffle = 1 if option[0] == '-F': fletcher32 = 1 elif option[0] == '-r': testwrite = 0 elif option[0] == '-w': testread = 0 elif option[0] == '-x': index = 0 elif option[0] == '-h': heavy = 1 elif option[0] == '-t': worst = 1 elif option[0] == '-b': bfile = option[1] elif option[0] == '-c': complevel = int(option[1]) elif option[0] == '-l': complib = option[1] elif option[0] == '-m': indexmode = option[1] if indexmode not in supported_imodes: raise ValueError("Indexmode should be any of '%s' and you passed '%s'" % (supported_imodes, indexmode)) elif option[0] == '-n': nrows = int(float(option[1])*1000) elif option[0] == '-N': noise = float(option[1]) elif option[0] == '-d': dselect = float(option[1]) elif option[0] == '-k': riter = int(option[1]) if worst: nrows -= 1 # the worst case if complib == "none": # This means no compression at all complib="zlib" # just to make PyTables not complaining complevel=0 # Catch the hdf5 file passed as the last argument file = pargs[0] # Build the Filters instance filters = Filters(complevel=complevel, complib=complib, shuffle=shuffle, fletcher32=fletcher32) # Create the benchfile (if needed) if not os.path.exists(bfile): createNewBenchFile(bfile, verbose) if testwrite: if verbose: print "Compression level:", complevel if complevel > 0: print "Compression library:", complib if shuffle: print "Suffling..." if psyco_imported and usepsyco: psyco.bind(createFile) benchCreate(file, nrows, filters, index, bfile, heavy, usepsyco, noise, verbose) if testread: if psyco_imported and usepsyco: psyco.bind(readFile) benchSearch(file, riter, indexmode, bfile, heavy, usepsyco, dselect, verbose)