""" Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting). """ import numpy as np from numpy.testing import assert_array_equal from numpy.testing import assert_array_almost_equal from numpy.testing import assert_equal from nose.tools import assert_raises, assert_true from sklearn.metrics import mean_squared_error from sklearn.utils import check_random_state from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] rng = np.random.RandomState(0) # also load the boston dataset # and randomly permute it boston = datasets.load_boston() perm = rng.permutation(boston.target.size) boston.data = boston.data[perm] boston.target = boston.target[perm] # also load the iris dataset # and randomly permute it iris = datasets.load_iris() perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] def test_classification_toy(): """Check classification on a toy dataset.""" clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) deviance_decrease = (clf.train_score_[:-1] - clf.train_score_[1:]) assert np.any(deviance_decrease >= 0.0), \ "Train deviance does not monotonically decrease." def test_parameter_checks(): """Check input parameter validation.""" assert_raises(ValueError, GradientBoostingClassifier(n_estimators=0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(n_estimators=-1).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(learning_rate=0.0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(learning_rate=-1.0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(loss='foobar').fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(min_samples_split=0.0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(min_samples_split=-1.0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(min_samples_leaf=0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(min_samples_leaf=-1.).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(subsample=0.0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(subsample=1.1).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(subsample=-0.1).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(max_depth=-0.1).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(max_depth=0).fit, X, y) assert_raises(ValueError, GradientBoostingClassifier(init={}).fit, X, y) # test fit before feature importance assert_raises(ValueError, lambda: GradientBoostingClassifier().feature_importances_) # binomial deviance requires ``n_classes == 2``. assert_raises(ValueError, lambda X, y: GradientBoostingClassifier( loss='bdeviance').fit(X, y), X, [0, 0, 1, 1, 2, 2]) # multinomial deviance requires ``n_classes > 2``. assert_raises(ValueError, lambda X, y: GradientBoostingClassifier( loss='mdeviance').fit(X, y), X, [0, 0, 1, 1, 1, 0]) # deviance requires ``n_classes >= 2``. 

def test_classification_synthetic():
    """Test GradientBoostingClassifier on the synthetic dataset used by
    Hastie et al. in ESLII Example 12.7.
    """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=1,
                                      max_depth=1, learning_rate=1.0,
                                      random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.085, \
        "GB failed with error %.4f" % error_rate

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=1,
                                      max_depth=1, learning_rate=1.0,
                                      subsample=0.5, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert error_rate < 0.08, \
        "Stochastic GB failed with error %.4f" % error_rate


def test_boston():
    """Check consistency on the boston house prices dataset with least
    squares, least absolute deviation, and huber loss.
    """
    for loss in ("ls", "lad", "huber"):
        clf = GradientBoostingRegressor(n_estimators=100, loss=loss,
                                        max_depth=4, min_samples_split=1,
                                        random_state=1)
        assert_raises(ValueError, clf.predict, boston.data)
        clf.fit(boston.data, boston.target)
        y_pred = clf.predict(boston.data)
        mse = mean_squared_error(boston.target, y_pred)
        assert mse < 6.0, "Failed with loss %s and mse = %.4f" % (loss, mse)


def test_iris():
    """Check consistency on the iris dataset."""
    for subsample in (1.0, 0.5):
        clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                         random_state=1, subsample=subsample)
        clf.fit(iris.data, iris.target)
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with subsample %.1f " \
            "and score = %f" % (subsample, score)

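
# A hedged follow-on to ``test_iris``: each row of ``predict_proba`` should be
# a distribution over the three iris classes. The row-sum-to-one property is
# assumed here; ``test_probability`` below only checks the [0, 1] range.
def test_iris_proba():
    """Sketch: multiclass predict_proba has one column per class."""
    clf = GradientBoostingClassifier(n_estimators=50, random_state=1)
    clf.fit(iris.data, iris.target)
    proba = clf.predict_proba(iris.data)
    assert_equal(3, proba.shape[1])
    # each row is assumed to sum to one
    assert_array_almost_equal(proba.sum(axis=1),
                              np.ones(iris.data.shape[0]))
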
""" random_state = check_random_state(1) regression_params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 1, 'learning_rate': 0.1, 'loss': 'ls'} # Friedman1 X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingRegressor() clf.fit(X_train, y_train) mse = mean_squared_error(y_test, clf.predict(X_test)) assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse # Friedman2 X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingRegressor(**regression_params) clf.fit(X_train, y_train) mse = mean_squared_error(y_test, clf.predict(X_test)) assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse # Friedman3 X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingRegressor(**regression_params) clf.fit(X_train, y_train) mse = mean_squared_error(y_test, clf.predict(X_test)) assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse def test_feature_importances(): X = np.array(boston.data, dtype=np.float32) y = np.array(boston.target, dtype=np.float32) clf = GradientBoostingRegressor(n_estimators=100, max_depth=5, min_samples_split=1, random_state=1) clf.fit(X, y) #feature_importances = clf.feature_importances_ assert_true(hasattr(clf, 'feature_importances_')) # true feature importance ranking # true_ranking = np.array([3, 1, 8, 2, 10, 9, 4, 11, 0, 6, 7, 5, 12]) # assert_array_equal(true_ranking, feature_importances.argsort()) def test_probability(): """Predict probabilities.""" clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.predict_proba, T) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) # check if probabilities are in [0, 1]. y_proba = clf.predict_proba(T) assert np.all(y_proba >= 0.0) assert np.all(y_proba <= 1.0) # derive predictions from probabilities y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0) assert_array_equal(y_pred, true_result) def test_check_inputs(): """Test input checks (shape and type of X and y).""" clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(ValueError, clf.fit, X, y + [0, 1]) from scipy import sparse X_sparse = sparse.csr_matrix(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) assert_raises(TypeError, clf.fit, X_sparse, y) clf = GradientBoostingClassifier().fit(X, y) assert_raises(TypeError, clf.predict, X_sparse) def test_check_inputs_predict(): """X has wrong shape """ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y) x = np.array([1.0, 2.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) x = np.array([]) assert_raises(ValueError, clf.predict, x) x = np.array([1.0, 2.0, 3.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, rng.rand(len(X))) x = np.array([1.0, 2.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) x = np.array([]) assert_raises(ValueError, clf.predict, x) x = np.array([1.0, 2.0, 3.0])[:, np.newaxis] assert_raises(ValueError, clf.predict, x) def test_check_max_features(): """test if max_features is valid. 
""" clf = GradientBoostingRegressor(n_estimators=100, random_state=1, max_features=0) assert_raises(ValueError, clf.fit, X, y) clf = GradientBoostingRegressor(n_estimators=100, random_state=1, max_features=(len(X[0]) + 1)) assert_raises(ValueError, clf.fit, X, y) def test_max_feature_regression(): """Test to make sure random state is set properly. """ X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_train, X_test = X[:2000], X[2000:] y_train, y_test = y[:2000], y[2000:] gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=5, max_depth=2, learning_rate=.1, max_features=2, random_state=1) gbrt.fit(X_train, y_train) deviance = gbrt.loss_(y_test, gbrt.decision_function(X_test)) assert_true(deviance < 0.5, "GB failed with deviance %.4f" % deviance) def test_staged_predict(): """Test whether staged decision function eventually gives the same prediction. """ X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingRegressor() # test raise ValueError if not fitted assert_raises(ValueError, lambda X: np.fromiter( clf.staged_predict(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # test if prediction for last stage equals ``predict`` for y in clf.staged_predict(X_test): assert_equal(y.shape, y_pred.shape) assert_array_equal(y_pred, y) def test_staged_predict_proba(): """Test whether staged predict proba eventually gives the same prediction. """ X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise ValueError if not fitted assert_raises(ValueError, lambda X: np.fromiter( clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_equal(clf.predict_proba(X_test), staged_proba) def test_serialization(): """Check model serialization.""" clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) try: import cPickle as pickle except ImportError: import pickle serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL) clf = None clf = pickle.loads(serialized_clf) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) def test_degenerate_targets(): """Check if we can fit even though all targets are equal. """ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) # classifier should raise exception assert_raises(ValueError, clf.fit, X, np.ones(len(X))) clf = GradientBoostingRegressor(n_estimators=100, random_state=1) clf.fit(X, np.ones(len(X))) clf.predict(rng.rand(2)) assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict(rng.rand(2))) def test_quantile_loss(): """Check if quantile loss with alpha=0.5 equals lad. 
""" clf_quantile = GradientBoostingRegressor(n_estimators=100, loss='quantile', max_depth=4, alpha=0.5, random_state=7) clf_quantile.fit(boston.data, boston.target) y_quantile = clf_quantile.predict(boston.data) clf_lad = GradientBoostingRegressor(n_estimators=100, loss='lad', max_depth=4, random_state=7) clf_lad.fit(boston.data, boston.target) y_lad = clf_lad.predict(boston.data) assert_array_almost_equal(y_quantile, y_lad, decimal=4) def test_symbol_labels(): """Test with non-integer class labels. """ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) symbol_y = map(str, y) clf.fit(X, symbol_y) assert_array_equal(clf.predict(T), map(str, true_result)) assert_equal(100, len(clf.estimators_)) def test_float_class_labels(): """Test with float class labels. """ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) float_y = np.asarray(y, dtype=np.float32) clf.fit(X, float_y) assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32)) assert_equal(100, len(clf.estimators_)) def test_shape_y(): """Test with float class labels. """ clf = GradientBoostingClassifier(n_estimators=100, random_state=1) y_ = np.asarray(y, dtype=np.int32) y_ = y_[:, np.newaxis] clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) def test_mem_layout(): """Test with different memory layouts of X and y""" X_ = np.asfortranarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) X_ = np.ascontiguousarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = y_[:, np.newaxis] y_ = np.ascontiguousarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = y_[:, np.newaxis] y_ = np.asfortranarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) def test_min_density(): """Check if min_density is properly set when growing deep trees.""" clf = GradientBoostingClassifier(max_depth=6) clf.fit(X, y) assert clf.min_density == 0.1 clf = GradientBoostingClassifier(max_depth=5) clf.fit(X, y) assert clf.min_density == 0.0