"""California housing dataset.

The original database is available from StatLib

    http://lib.stat.cmu.edu/

The data contains 20,640 observations on 9 variables.

References
----------
Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
Statistics and Probability Letters, 33 (1997) 291-297.

"""
# Authors: Peter Prettenhofer
# License: Simplified BSD

from os.path import join, exists
from os import makedirs
from cStringIO import StringIO
from zipfile import ZipFile
import urllib2

import numpy as np

from .base import get_data_home, Bunch
from ..externals import joblib

DATA_URL = "http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&"\
    "file=index&req=getit&lid=83"
TARGET_FILENAME = "cal_housing.pkz"

# Grab the module-level docstring to use as a description of the
# dataset
MODULE_DOCS = __doc__


def fetch_california_housing(data_home=None, download_if_missing=True):
    """Loader for the California housing dataset from StatLib.

    The raw table is downloaded once, cached as a compressed pickle in
    ``data_home`` and re-loaded from that cache on subsequent calls.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    dataset : Bunch
        Dictionary-like object with the following attributes:

        data : array, one row per sample, one column per entry of
            ``feature_names``.
        target : array, first column of the raw table divided by 100,000.
        feature_names : list of the 8 column names of ``data``.
        DESCR : the module-level docstring describing the dataset.

    Raises
    ------
    IOError
        If the cached file does not exist and ``download_if_missing`` is
        False.

    Notes
    ------

    This dataset consists of 20,640 samples and 8 features (plus the target).

    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)

    cache_path = join(data_home, TARGET_FILENAME)
    if not exists(cache_path):
        # Honour the documented contract: never hit the network when the
        # caller explicitly opted out of downloading.
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
        fhandle = urllib2.urlopen(DATA_URL)
        try:
            buf = StringIO(fhandle.read())
        finally:
            # Release the HTTP connection even if the read fails.
            fhandle.close()

        zip_file = ZipFile(buf)
        try:
            cadata_fd = zip_file.open('cadata.txt', 'r')
            cadata = StringIO(cadata_fd.read())
            # skip the first 27 lines (documentation)
            cal_housing = np.loadtxt(cadata, skiprows=27)
            joblib.dump(cal_housing, cache_path, compress=6)
        finally:
            zip_file.close()
    else:
        cal_housing = joblib.load(cache_path)

    feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms",
                     "Population", "AveOccup", "Latitude", "Longitude"]

    # First column is the target, the remaining columns are the features.
    target, data = cal_housing[:, 0], cal_housing[:, 1:]

    # Column 5 still holds the raw household count at this point; it must be
    # overwritten last because the two divisions below depend on it.

    # avg rooms = total rooms / households
    data[:, 2] /= data[:, 5]

    # avg bed rooms = total bed rooms / households
    data[:, 3] /= data[:, 5]

    # avg occupancy = population / households
    data[:, 5] = data[:, 4] / data[:, 5]

    # target in units of 100,000
    target = target / 100000.0

    return Bunch(data=data,
                 target=target,
                 feature_names=feature_names,
                 DESCR=MODULE_DOCS)