import numpy as np import tables from time import time N = 1000*1000 NCOLL = 200 # 200 collections maximum # In order to have reproducible results np.random.seed(19) class Energies(tables.IsDescription): collection = tables.UInt8Col() energy = tables.Float64Col() def fill_bucket(lbucket): #c = np.random.normal(NCOLL/2, NCOLL/10, lbucket) c = np.random.normal(NCOLL/2, NCOLL/100, lbucket) e = np.arange(lbucket, dtype='f8') return c, e # Fill the table t1 = time() f = tables.open_file("data.nobackup/collations.h5", "w") table = f.create_table("/", "Energies", Energies, expectedrows=N) # Fill the table with values lbucket = 1000 # Fill in buckets of 1000 rows, for speed for i in xrange(0, N, lbucket): bucket = fill_bucket(lbucket) table.append(bucket) # Fill the remaining rows bucket = fill_bucket(N%lbucket) table.append(bucket) f.close() print "Time to create the table with %d entries: %.3f" % (N, time()-t1) # Now, read the table and group it by collection f = tables.open_file("data.nobackup/collations.h5", "a") table = f.root.Energies ######################################################### # First solution: load the table completely in memory ######################################################### t1 = time() t = table[:] # convert to structured array coll1 = [] collections = np.unique(t['collection']) for c in collections: cond = t['collection'] == c energy_this_collection = t['energy'][cond] sener = energy_this_collection.sum() coll1.append(sener) print c, ' : ', sener del collections, energy_this_collection print "Time for first solution: %.3f" % (time()-t1) ######################################################### # Second solution: load all the collections in memory ######################################################### t1 = time() collections = {} for row in table: c = row['collection'] e = row['energy'] if c in collections: collections[c].append(e) else: collections[c] = [e] # Convert the lists in numpy arrays coll2 = [] for c in sorted(collections): energy_this_collection = np.array(collections[c]) sener = energy_this_collection.sum() coll2.append(sener) print c, ' : ', sener del collections, energy_this_collection print "Time for second solution: %.3f" % (time()-t1) t1 = time() table.cols.collection.create_csindex() #table.cols.collection.reindex() print "Time for indexing: %.3f" % (time()-t1) ######################################################### # Third solution: load each collection separately ######################################################### t1 = time() coll3 = [] for c in np.unique(table.col('collection')) : energy_this_collection = table.read_where('collection == c', field='energy') sener = energy_this_collection.sum() coll3.append(sener) print c, ' : ', sener del energy_this_collection print "Time for third solution: %.3f" % (time()-t1) t1 = time() table2 = table.copy('/', 'EnergySortedByCollation', overwrite=True, sortby="collection", propindexes=True) print "Time for sorting: %.3f" % (time()-t1) ##################################################################### # Fourth solution: load each collection separately. Sorted table. ##################################################################### t1 = time() coll4 = [] for c in np.unique(table2.col('collection')) : energy_this_collection = table2.read_where( 'collection == c', field='energy') sener = energy_this_collection.sum() coll4.append(sener) print c, ' : ', sener del energy_this_collection print "Time for fourth solution: %.3f" % (time()-t1) # Finally, check that all solutions do match assert coll1 == coll2 == coll3 == coll4 f.close()