#!/usr/bin/python import sqlite import random import time import sys import os import os.path from tables import * import numarray from numarray import strings from numarray import random_array randomvalues = 0 standarddeviation = 10000 # Initialize the random generator always with the same integer # in order to have reproductible results random.seed(19) random_array.seed(19, 20) # defaults psycon = 0 worst = 0 def createNewBenchFile(bfile, verbose): class Create(IsDescription): nrows = Int32Col(pos=0) irows = Int32Col(pos=1) tfill = Float64Col(pos=2) tidx = Float64Col(pos=3) tcfill = Float64Col(pos=4) tcidx = Float64Col(pos=5) rowsecf = Float64Col(pos=6) rowseci = Float64Col(pos=7) fsize = Float64Col(pos=8) isize = Float64Col(pos=9) psyco = BoolCol(pos=10) class Search(IsDescription): nrows = Int32Col(pos=0) rowsel = Int32Col(pos=1) time1 = Float64Col(pos=2) time2 = Float64Col(pos=3) tcpu1 = Float64Col(pos=4) tcpu2 = Float64Col(pos=5) rowsec1 = Float64Col(pos=6) rowsec2 = Float64Col(pos=7) psyco = BoolCol(pos=8) if verbose: print "Creating a new benchfile:", bfile # Open the benchmarking file bf = open_file(bfile, "w") # Create groups for recsize in ["sqlite_small"]: group = bf.create_group("/", recsize, recsize+" Group") # Attach the row size of table as attribute if recsize == "small": group._v_attrs.rowsize = 16 # Create a Table for writing bench bf.create_table(group, "create_indexed", Create, "indexed values") bf.create_table(group, "create_standard", Create, "standard values") # create a group for searching bench groupS = bf.create_group(group, "search", "Search Group") # Create Tables for searching for mode in ["indexed", "standard"]: group = bf.create_group(groupS, mode, mode+" Group") # for searching bench #for atom in ["string", "int", "float", "bool"]: for atom in ["string", "int", "float"]: bf.create_table(group, atom, Search, atom+" bench") bf.close() def createFile(filename, nrows, filters, indexmode, heavy, noise, bfile, verbose): # Initialize some variables t1 = 0.; t2 = 0. tcpu1 = 0.; tcpu2 = 0. rowsecf = 0.; rowseci = 0. size1 = 0.; size2 = 0. if indexmode == "standard": print "Creating a new database:", dbfile instd=os.popen("/usr/local/bin/sqlite "+dbfile, "w") CREATESTD=""" CREATE TABLE small ( -- Name Type -- Example --------------------------------------- recnum INTEGER PRIMARY KEY, -- 345 var1 char(4), -- Abronia villosa var2 INTEGER, -- 111 var3 FLOAT -- 12.32 ); """ CREATEIDX=""" CREATE TABLE small ( -- Name Type -- Example --------------------------------------- recnum INTEGER PRIMARY KEY, -- 345 var1 char(4), -- Abronia villosa var2 INTEGER, -- 111 var3 FLOAT -- 12.32 ); CREATE INDEX ivar1 ON small(var1); CREATE INDEX ivar2 ON small(var2); CREATE INDEX ivar3 ON small(var3); """ # Creating the table first and indexing afterwards is a bit faster instd.write(CREATESTD) instd.close() conn = sqlite.connect(dbfile) cursor = conn.cursor() if indexmode == "standard": place_holders = ",".join(['%s']*3) # Insert rows SQL = "insert into small values(NULL, %s)" % place_holders time1 = time.time() cpu1 = time.clock() # This way of filling is to copy the PyTables benchmark nrowsbuf = 1000 minimum = 0 maximum = nrows for i in xrange(0, nrows, nrowsbuf): if i+nrowsbuf > nrows: j = nrows else: j = i+nrowsbuf if randomvalues: var3 = random_array.uniform(minimum, maximum, shape=[j-i]) else: var3 = numarray.arange(i, j, type=numarray.Float64) if noise: var3 += random_array.uniform(-3, 3, shape=[j-i]) var2 = numarray.array(var3, type=numarray.Int32) var1 = strings.array(None, shape=[j-i], itemsize=4) if not heavy: for n in xrange(j-i): var1[n] = str("%.4s" % var2[n]) for n in xrange(j-i): fields = (var1[n], var2[n], var3[n]) cursor.execute(SQL, fields) conn.commit() t1 = round(time.time()-time1, 5) tcpu1 = round(time.clock()-cpu1, 5) rowsecf = nrows/t1 size1 = os.stat(dbfile)[6] print "******** Results for writing nrows = %s" % (nrows), "*********" print "Insert time:", t1, ", KRows/s:", round((nrows/10.**3)/t1, 3), print ", File size:", round(size1/(1024.*1024.), 3), "MB" # Indexem if indexmode == "indexed": time1 = time.time() cpu1 = time.clock() if not heavy: cursor.execute("CREATE INDEX ivar1 ON small(var1)") conn.commit() cursor.execute("CREATE INDEX ivar2 ON small(var2)") conn.commit() cursor.execute("CREATE INDEX ivar3 ON small(var3)") conn.commit() t2 = round(time.time()-time1, 5) tcpu2 = round(time.clock()-cpu1, 5) rowseci = nrows/t2 print "Index time:", t2, ", IKRows/s:", round((nrows/10.**3)/t2, 3), size2 = os.stat(dbfile)[6] - size1 print ", Final size with index:", round(size2/(1024.*1024), 3), "MB" conn.close() # Collect benchmark data bf = open_file(bfile, "a") recsize = "sqlite_small" if indexmode == "indexed": table = bf.get_node("/"+recsize+"/create_indexed") else: table = bf.get_node("/"+recsize+"/create_standard") table.row["nrows"] = nrows table.row["irows"] = nrows table.row["tfill"] = t1 table.row["tidx"] = t2 table.row["tcfill"] = tcpu1 table.row["tcidx"] = tcpu2 table.row["psyco"] = psycon table.row["rowsecf"] = rowsecf table.row["rowseci"] = rowseci table.row["fsize"] = size1 table.row["isize"] = size2 table.row.append() bf.close() return def readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter): # Connect to the database. conn = sqlite.connect(db=dbfile, mode=755) # Obtain a cursor cursor = conn.cursor() # select count(*), avg(var2) SQL1 = """ select recnum from small where var1 = %s """ SQL2 = """ select recnum from small where var2 >= %s and var2 < %s """ SQL3 = """ select recnum from small where var3 >= %s and var3 < %s """ # Open the benchmark database bf = open_file(bfile, "a") #default values for the case that columns are not indexed t2 = 0. tcpu2 = 0. # Some previous computations for the case of random values if randomvalues: # algorithm to choose a value separated from mean # # If want to select fewer values, select this # if nrows/2 > standarddeviation*3: # # Choose five standard deviations away from mean value # dev = standarddeviation*5 # #dev = standarddeviation*math.log10(nrows/1000.) # This algorithm give place to too asymmetric result values # if standarddeviation*10 < nrows/2: # # Choose four standard deviations away from mean value # dev = standarddeviation*4 # else: # dev = 100 # Yet Another Algorithm if nrows/2 > standarddeviation*10: dev = standarddeviation*4. elif nrows/2 > standarddeviation: dev = standarddeviation*2. elif nrows/2 > standarddeviation/10.: dev = standarddeviation/10. else: dev = standarddeviation/100. valmax = int(round((nrows/2.)-dev)) # split the selection range in regular chunks if riter > valmax*2: riter = valmax*2 chunksize = (valmax*2/riter)*10 # Get a list of integers for the intervals randlist = range(0, valmax, chunksize) randlist.extend(range(nrows-valmax, nrows, chunksize)) # expand the list ten times so as to use the cache randlist = randlist*10 # shuffle the list random.shuffle(randlist) # reset the value of chunksize chunksize = chunksize/10 #print "chunksize-->", chunksize #randlist.sort();print "randlist-->", randlist else: chunksize = 3 if heavy: searchmodelist = ["int", "float"] else: searchmodelist = ["string", "int", "float"] # Execute queries for atom in searchmodelist: time2 = 0 cpu2 = 0 rowsel = 0 for i in xrange(riter): rnd = random.randrange(nrows) time1 = time.time() cpu1 = time.clock() if atom == "string": #cursor.execute(SQL1, "1111") cursor.execute(SQL1, str(rnd)[-4:]) elif atom == "int": #cursor.execute(SQL2 % (rnd, rnd+3)) cursor.execute(SQL2 % (rnd, rnd+dselect)) elif atom == "float": #cursor.execute(SQL3 % (float(rnd), float(rnd+3))) cursor.execute(SQL3 % (float(rnd), float(rnd+dselect))) else: raise ValueError("atom must take a value in ['string','int','float']") if i == 0: t1 = time.time() - time1 tcpu1 = time.clock() - cpu1 else: if indexmode == "indexed": # if indexed, wait until the 5th iteration to take # times (so as to insure that the index is # effectively cached) if i >= 5: time2 += time.time() - time1 cpu2 += time.clock() - cpu1 else: time2 += time.time() - time1 time2 += time.clock() - cpu1 if riter > 1: if indexmode == "indexed" and riter >= 5: correction = 5 else: correction = 1 t2 = time2/(riter-correction) tcpu2 = cpu2/(riter-correction) print "*** Query results for atom = %s, nrows = %s, indexmode = %s ***" % (atom, nrows, indexmode) print "Query time:", round(t1, 5), ", cached time:", round(t2, 5) print "MRows/s:", round((nrows/10.**6)/t1, 3), if t2 > 0: print ", cached MRows/s:", round((nrows/10.**6)/t2, 3) else: print # Collect benchmark data recsize = "sqlite_small" tablepath = "/"+recsize+"/search/"+indexmode+"/"+atom table = bf.get_node(tablepath) table.row["nrows"] = nrows table.row["rowsel"] = rowsel table.row["time1"] = t1 table.row["time2"] = t2 table.row["tcpu1"] = tcpu1 table.row["tcpu2"] = tcpu2 table.row["psyco"] = psycon table.row["rowsec1"] = nrows/t1 if t2 > 0: table.row["rowsec2"] = nrows/t2 table.row.append() table.flush() # Flush the data # Close the database conn.close() bf.close() # the bench database return if __name__=="__main__": import getopt try: import psyco psyco_imported = 1 except: psyco_imported = 0 usage = """usage: %s [-v] [-p] [-R] [-h] [-t] [-r] [-w] [-n nrows] [-b file] [-k riter] [-m indexmode] [-N range] datafile -v verbose -p use "psyco" if available -R use Random values for filling -h heavy mode (exclude strings from timings) -t worsT searching case (to emulate PyTables worst cases) -r only read test -w only write test -n the number of rows (in krows) -b bench filename -N introduce (uniform) noise within range into the values -d the interval for look values (int, float) at. Default is 3. -k number of iterations for reading\n""" % sys.argv[0] try: opts, pargs = getopt.getopt(sys.argv[1:], 'vpRhtrwn:b:k:m:N:d:') except: sys.stderr.write(usage) sys.exit(0) # if we pass too much parameters, abort if len(pargs) != 1: sys.stderr.write(usage) sys.exit(0) # default options dselect = 3. noise = 0. verbose = 0 heavy = 0 testread = 1 testwrite = 1 usepsyco = 0 nrows = 1000 bfile = "sqlite-bench.h5" supported_imodes = ["indexed", "standard"] indexmode = "indexed" riter = 2 # Get the options for option in opts: if option[0] == '-v': verbose = 1 if option[0] == '-p': usepsyco = 1 elif option[0] == '-R': randomvalues = 1 elif option[0] == '-h': heavy = 1 elif option[0] == '-t': worst = 1 elif option[0] == '-r': testwrite = 0 elif option[0] == '-w': testread = 0 elif option[0] == '-b': bfile = option[1] elif option[0] == '-N': noise = float(option[1]) elif option[0] == '-m': indexmode = option[1] if indexmode not in supported_imodes: raise ValueError("Indexmode should be any of '%s' and you passed '%s'" % (supported_imodes, indexmode)) elif option[0] == '-n': nrows = int(float(option[1])*1000) elif option[0] == '-d': dselect = float(option[1]) elif option[0] == '-k': riter = int(option[1]) # remaining parameters dbfile=pargs[0] if worst: nrows -= 1 # the worst case # Create the benchfile (if needed) if not os.path.exists(bfile): createNewBenchFile(bfile, verbose) if testwrite: if psyco_imported and usepsyco: psyco.bind(createFile) psycon = 1 createFile(dbfile, nrows, None, indexmode, heavy, noise, bfile, verbose) if testread: if psyco_imported and usepsyco: psyco.bind(readFile) psycon = 1 readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter)