library(itertools)

# Write a data frame to disk, using one file per column.
#
# Arguments:
#   df      - the data frame whose columns are written.
#   fprefix - character vector used to build the file names.  A single
#             string is expanded to one prefix per column by appending
#             "_01", "_02", ...; otherwise element i is used as the
#             prefix for column i, so it must supply at least one
#             element per column.  Defaults to the expression passed
#             as "df".
#
# Each column is written to "<prefix>_<type>.col", where <type> is taken
# from the first element of the column's class ("numeric" is recorded
# as "double"):
#   factor    - integer codes via writeBin, plus the levels, one per
#               line, in "<prefix>_factor.lev"
#   character - one element per line via writeLines
#   integer   - writeBin
#   double    - writeBin
# Any other type raises an error.
#
# Called for its side effect; returns NULL invisibly.
writedf <- function(df, fprefix=as.character(substitute(df))) {
  # Handle fprefix argument
  if (! is.character(fprefix))
    stop('fprefix must be character')
  # An NA prefix would be formatted as the literal text "NA" by sprintf
  if (anyNA(fprefix))
    stop('fprefix must not contain NA')

  ncols <- length(df)
  if (length(fprefix) == 1) {
    fprefix <- sprintf('%s_%02d', fprefix, seq_len(ncols))
  } else if (length(fprefix) < ncols) {
    # Indexing past the end of fprefix yields NA, which would silently
    # send the remaining columns to files named "NA_<type>.col".
    stop(sprintf('fprefix has %d element(s) but df has %d column(s)',
                 length(fprefix), ncols))
  }

  # Write each column of "df" to a file
  for (icol in seq_len(ncols)) {
    p <- fprefix[icol]
    column <- df[[icol]]
    cls <- class(column)[1]
    type <- if (cls == 'numeric') 'double' else cls

    switch(type,
      factor = {
        writeBin(as.integer(column), sprintf('%s_factor.col', p))
        writeLines(levels(column), sprintf('%s_factor.lev', p))
      },
      character = writeLines(column, sprintf('%s_character.col', p)),
      integer = writeBin(column, sprintf('%s_integer.col', p)),
      double = writeBin(column, sprintf('%s_double.col', p)),
      stop('unsupported type: ', type)
    )
  }

  invisible(NULL)
}

# Read a data frame from files containing column data.
# The arguments "n", "start", and "col.names" can be used
# to read part of the data frame.
#
# Although you must specify a value for "n", you can
# specify a value that is larger than the number of elements
# in the files.  This will allocate too much memory, so
# try not to overestimate by too much.
readdf <- function(filenames, n, start=1, col.names) {
  # Request chunks of size "n"; only the first chunk is handed back.
  chunks <- ireaddf(filenames, n, start, col.names, chunkSize=n)
  first_chunk <- nextElem(chunks)

  # Advance the iterator once more and discard any error it raises.
  # NOTE(review): presumably this lets the iterator reach its end and
  # clean up after itself -- confirm against ireaddf.
  tryCatch(
    nextElem(chunks),
    error=function(cond) invisible()
  )

  first_chunk
}

# Call readdf on a subset of the 'fifty1_*.col' files found in the
# working directory.  "s" indexes both the globbed file names and the
# hard-coded column names; "n" is passed straight through to readdf.
testreaddf <- function(n=1000, s=c(1, 8, 9, 12)) {
  all_names <- c('AOU', 'RouteDataID', 'countrynum', 'statenum', 'Route',
                 'RPID', 'year', 'rteNo', 'species', 'stopNo', 'count',
                 'rtestopNo')
  col_files <- Sys.glob('fifty1_*.col')

  picked_files <- col_files[s]
  picked_names <- all_names[s]
  readdf(picked_files, n, col.names=picked_names)
}

# Iterate over the 'fifty1_*.col' files chunk by chunk, feed the chunks
# to a writedf.combiner targeting the corresponding 'mod' file names,
# then iterate over both sets of files in parallel and print whether
# every pair of chunks is identical.
testireaddf <- function(n=200000000, chunkSize=1000000) {
  library(foreach)

  src_files <- Sys.glob('fifty1_*.col')
  all_names <- c('AOU', 'RouteDataID', 'countrynum', 'statenum', 'Route',
                 'RPID', 'year', 'rteNo', 'species', 'stopNo', 'count',
                 'rtestopNo')

  # Destination names: same as the sources with 'fifty1' replaced by 'mod'
  dst_files <- sub('fifty1', 'mod', src_files, fixed=TRUE)
  print(dst_files)
  writer <- writedf.combiner(dst_files)

  # Copy the sub-data frames from one set of files to another
  cat('Starting to read "fifty1" data files...\n')
  foreach(chunk=ireaddf(src_files, n, col.names=all_names,
                        chunkSize=chunkSize),
          .combine=writer$combine,
          .maxcombine=3) %do% {
    chunk
  }
  writer$close()

  # Check that the two sets of files contain identical data frames
  cat('Compare the two sets of data files...\n')
  all_same <- foreach(lhs=ireaddf(src_files, n, col.names=all_names,
                                  chunkSize=chunkSize),
                      rhs=ireaddf(dst_files, n, col.names=all_names,
                                  chunkSize=chunkSize),
                      .combine='all',
                      .maxcombine=50) %do% {
    identical(lhs, rhs)
  }
  print(all_same)

  cat('Finished\n')
}