"""
Testing for the tree module (sklearn.tree).
"""

import numpy as np
from numpy.testing import assert_array_equal
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_almost_equal
from numpy.testing import assert_equal
from nose.tools import assert_raises
from nose.tools import assert_true

from sklearn import tree
from sklearn import datasets
from sklearn.preprocessing import balance_weights

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]

# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = np.random.RandomState(1)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the boston dataset
# and randomly permute it
boston = datasets.load_boston()
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]


def test_classification_toy():
    """Check classification on a toy dataset."""
    # Decision trees
    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    clf = tree.DecisionTreeClassifier(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # Extra-trees
    clf = tree.ExtraTreeClassifier()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    clf = tree.ExtraTreeClassifier(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)


def test_weighted_classification_toy():
    """Check classification on a weighted toy dataset."""
    clf = tree.DecisionTreeClassifier()

    clf.fit(X, y, sample_weight=np.ones(len(X)))
    assert_array_equal(clf.predict(T), true_result)

    clf.fit(X, y, sample_weight=np.ones(len(X)) * 0.5)
    assert_array_equal(clf.predict(T), true_result)


def test_regression_toy():
    """Check regression on a toy dataset."""
    # Decision trees
    clf = tree.DecisionTreeRegressor()
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    clf = tree.DecisionTreeRegressor(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    # Extra-trees
    clf = tree.ExtraTreeRegressor()
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)

    clf = tree.ExtraTreeRegressor(max_features=1, random_state=1)
    clf.fit(X, y)
    assert_almost_equal(clf.predict(T), true_result)


def test_xor():
    """Check on a XOR problem"""
    y = np.zeros((10, 10))
    y[:5, :5] = 1
    y[5:, 5:] = 1

    gridx, gridy = np.indices(y.shape)

    X = np.vstack([gridx.ravel(), gridy.ravel()]).T
    y = y.ravel()

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)
    assert_equal(clf.score(X, y), 1.0)

    clf = tree.DecisionTreeClassifier(max_features=1)
    clf.fit(X, y)
    assert_equal(clf.score(X, y), 1.0)

    clf = tree.ExtraTreeClassifier()
    clf.fit(X, y)
    assert_equal(clf.score(X, y), 1.0)

    clf = tree.ExtraTreeClassifier(max_features=1)
    clf.fit(X, y)
    assert_equal(clf.score(X, y), 1.0)


def test_graphviz_toy():
    """Check correctness of graphviz output on a toy dataset."""
    clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_split=1)
    clf.fit(X, y)
    from StringIO import StringIO

    # test export code
    out = StringIO()
    tree.export_graphviz(clf, out_file=out)
    contents1 = out.getvalue()

    tree_toy = StringIO(
        "digraph Tree {\n"
        "0 [label=\"X[0] <= 0.0000\\nerror = 0.5"
        "\\nsamples = 6\\nvalue = [ 3.  3.]\", shape=\"box\"] ;\n"
        "1 [label=\"error = 0.0000\\nsamples = 3\\n"
        "value = [ 3.  0.]\", shape=\"box\"] ;\n"
        "0 -> 1 ;\n"
        "2 [label=\"error = 0.0000\\nsamples = 3\\n"
        "value = [ 0.  3.]\", shape=\"box\"] ;\n"
        "0 -> 2 ;\n"
        "}")
    contents2 = tree_toy.getvalue()

    assert contents1 == contents2, \
        "graphviz output test failed\n: %s != %s" % (contents1, contents2)

    # test with feature_names
    out = StringIO()
    out = tree.export_graphviz(clf, out_file=out,
                               feature_names=["feature1", ""])
    contents1 = out.getvalue()

    tree_toy = StringIO(
        "digraph Tree {\n"
        "0 [label=\"feature1 <= 0.0000\\nerror = 0.5"
        "\\nsamples = 6\\nvalue = [ 3.  3.]\", shape=\"box\"] ;\n"
        "1 [label=\"error = 0.0000\\nsamples = 3\\n"
        "value = [ 3.  0.]\", shape=\"box\"] ;\n"
        "0 -> 1 ;\n"
        "2 [label=\"error = 0.0000\\nsamples = 3\\n"
        "value = [ 0.  3.]\", shape=\"box\"] ;\n"
        "0 -> 2 ;\n"
        "}")
    contents2 = tree_toy.getvalue()

    assert contents1 == contents2, \
        "graphviz output test failed\n: %s != %s" % (contents1, contents2)

    # test improperly formed feature_names
    out = StringIO()
    assert_raises(IndexError, tree.export_graphviz,
                  clf, out, feature_names=[])


def test_iris():
    """Check consistency on dataset iris."""
    for c in ('gini',
              'entropy'):
        clf = tree.DecisionTreeClassifier(criterion=c).fit(iris.data,
                                                           iris.target)

        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.9, "Failed with criterion " + c + \
            " and score = " + str(score)

        clf = tree.DecisionTreeClassifier(criterion=c,
                                          max_features=2,
                                          random_state=1).fit(iris.data,
                                                              iris.target)

        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.5, "Failed with criterion " + c + \
            " and score = " + str(score)


def test_boston():
    """Check consistency on dataset boston house prices."""
    for c in ('mse',):
        clf = tree.DecisionTreeRegressor(criterion=c).fit(boston.data,
                                                          boston.target)

        score = np.mean(np.power(clf.predict(boston.data) - boston.target, 2))
        assert score < 1, "Failed with criterion " + c + \
            " and score = " + str(score)

        clf = tree.DecisionTreeRegressor(criterion=c,
                                         max_features=6,
                                         random_state=1).fit(boston.data,
                                                             boston.target)

        # using fewer features reduces the learning ability of this tree,
        # but reduces training time.
        score = np.mean(np.power(clf.predict(boston.data) - boston.target, 2))
        assert score < 2, "Failed with criterion " + c + \
            " and score = " + str(score)


def test_probability():
    """Predict probabilities using DecisionTreeClassifier."""
    clf = tree.DecisionTreeClassifier(max_depth=1, max_features=1,
                                      random_state=42)
    clf.fit(iris.data, iris.target)

    prob_predict = clf.predict_proba(iris.data)
    assert_array_almost_equal(
        np.sum(prob_predict, 1), np.ones(iris.data.shape[0]))
    assert np.mean(np.argmax(prob_predict, 1)
                   == clf.predict(iris.data)) > 0.9

    assert_almost_equal(clf.predict_proba(iris.data),
                        np.exp(clf.predict_log_proba(iris.data)), 8)


def test_arrayrepr():
    """Check the array representation."""
    # Check resize
    clf = tree.DecisionTreeRegressor(max_depth=None)
    X = np.arange(10000)[:, np.newaxis]
    y = np.arange(10000)
    clf.fit(X, y)


def test_pure_set():
    """Check when y is pure."""
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    y = [1, 1, 1, 1, 1, 1]

    clf = tree.DecisionTreeClassifier().fit(X, y)
    assert_array_equal(clf.predict(X), y)

    clf = tree.DecisionTreeRegressor().fit(X, y)
    assert_array_equal(clf.predict(X), y)


def test_numerical_stability():
    """Check numerical stability."""
    old_settings = np.geterr()
    np.seterr(all="raise")

    X = np.array([
        [152.08097839, 140.40744019, 129.75102234, 159.90493774],
        [142.50700378, 135.81935120, 117.82884979, 162.75781250],
        [127.28772736, 140.40744019, 129.75102234, 159.90493774],
        [132.37025452, 143.71923828, 138.35694885, 157.84558105],
        [103.10237122, 143.71928406, 138.35696411, 157.84559631],
        [127.71276855, 143.71923828, 138.35694885, 157.84558105],
        [120.91514587, 140.40744019, 129.75102234, 159.90493774]])

    y = np.array(
        [1., 0.70209277, 0.53896582, 0., 0.90914464, 0.48026916,  0.49622521])

    dt = tree.DecisionTreeRegressor()
    dt.fit(X, y)
    dt.fit(X, -y)
    dt.fit(-X, y)
    dt.fit(-X, -y)

    np.seterr(**old_settings)


def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    clf = tree.DecisionTreeClassifier(compute_importances=True)
    clf.fit(X, y)
    importances = clf.feature_importances_
    n_important = sum(importances > 0.1)

    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    X_new = clf.transform(X, threshold="mean")
    assert 0 < X_new.shape[1] < X.shape[1]

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)
    assert_true(clf.feature_importances_ is None)


def test_error():
    """Test that it gives proper exception on deficient input."""
    # Invalid values for parameters
    assert_raises(ValueError,
                  tree.DecisionTreeClassifier(min_samples_leaf=-1).fit,
                  X, y)

    assert_raises(ValueError,
                  tree.DecisionTreeClassifier(min_samples_split=-1).fit,
                  X, y)

    assert_raises(ValueError,
                  tree.DecisionTreeClassifier(max_depth=-1).fit,
                  X, y)

    assert_raises(ValueError,
                  tree.DecisionTreeClassifier(min_density=2.0).fit,
                  X, y)

    assert_raises(ValueError,
                  tree.DecisionTreeClassifier(max_features=42).fit,
                  X, y)

    # Wrong dimensions
    clf = tree.DecisionTreeClassifier()
    y2 = y[:-1]
    assert_raises(ValueError, clf.fit, X, y2)

    # Test with arrays that are non-contiguous.
    Xf = np.asfortranarray(X)
    clf = tree.DecisionTreeClassifier()
    clf.fit(Xf, y)
    assert_array_equal(clf.predict(T), true_result)

    # predict before fitting
    clf = tree.DecisionTreeClassifier()
    assert_raises(Exception, clf.predict, T)

    # predict on vector with different dims
    clf.fit(X, y)
    t = np.asarray(T)
    assert_raises(ValueError, clf.predict, t[:, 1:])

   # use values of max_features that are invalid
    clf = tree.DecisionTreeClassifier(max_features=10)
    assert_raises(ValueError, clf.fit, X, y)

    clf = tree.DecisionTreeClassifier(max_features=-1)
    assert_raises(ValueError, clf.fit, X, y)

    clf = tree.DecisionTreeClassifier(max_features="foobar")
    assert_raises(ValueError, clf.fit, X, y)

    tree.DecisionTreeClassifier(max_features="auto").fit(X, y)
    tree.DecisionTreeClassifier(max_features="sqrt").fit(X, y)
    tree.DecisionTreeClassifier(max_features="log2").fit(X, y)
    tree.DecisionTreeClassifier(max_features=None).fit(X, y)

    # predict before fit
    clf = tree.DecisionTreeClassifier()
    assert_raises(Exception, clf.predict_proba, X)

    clf.fit(X, y)
    X2 = [-2, -1, 1]  # wrong feature shape for sample
    assert_raises(ValueError, clf.predict_proba, X2)

    # wrong sample shape
    Xt = np.array(X).T

    clf = tree.DecisionTreeClassifier()
    clf.fit(np.dot(X, Xt), y)
    assert_raises(ValueError, clf.predict, X)

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)
    assert_raises(ValueError, clf.predict, Xt)

    # wrong length of sample mask
    clf = tree.DecisionTreeClassifier()
    sample_mask = np.array([1])
    assert_raises(ValueError, clf.fit, X, y, sample_mask=sample_mask)

    # wrong length of X_argsorted
    clf = tree.DecisionTreeClassifier()
    X_argsorted = np.array([1])
    assert_raises(ValueError, clf.fit, X, y, X_argsorted=X_argsorted)


def test_min_samples_leaf():
    """Test if leaves contain more than leaf_count training examples"""
    X = np.asfortranarray(iris.data.astype(tree._tree.DTYPE))
    y = iris.target

    for tree_class in [tree.DecisionTreeClassifier, tree.ExtraTreeClassifier]:
        clf = tree_class(min_samples_leaf=5).fit(X, y)

        out = clf.tree_.apply(X)
        node_counts = np.bincount(out)
        leaf_count = node_counts[node_counts != 0]  # drop inner nodes

        assert np.min(leaf_count) >= 5


def test_pickle():
    import pickle

    # classification
    obj = tree.DecisionTreeClassifier()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert score == score2, "Failed to generate same score " + \
        " after pickling (classification) "

    # regression
    obj = tree.DecisionTreeRegressor()
    obj.fit(boston.data, boston.target)
    score = obj.score(boston.data, boston.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(boston.data, boston.target)
    assert score == score2, "Failed to generate same score " + \
        " after pickling (regression) "


def test_multioutput():
    """Check estimators on multi-output problems."""

    X = [[-2, -1],
         [-1, -1],
         [-1, -2],
         [1, 1],
         [1, 2],
         [2, 1],
         [-2, 1],
         [-1, 1],
         [-1, 2],
         [2, -1],
         [1, -1],
         [1, -2]]

    y = [[-1, 0],
         [-1, 0],
         [-1, 0],
         [1, 1],
         [1, 1],
         [1, 1],
         [-1, 2],
         [-1, 2],
         [-1, 2],
         [1, 3],
         [1, 3],
         [1, 3]]

    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]

    # toy classification problem
    clf = tree.DecisionTreeClassifier()
    y_hat = clf.fit(X, y).predict(T)
    assert_array_equal(y_hat, y_true)
    assert_equal(y_hat.shape, (4, 2))

    proba = clf.predict_proba(T)
    assert_equal(len(proba), 2)
    assert_equal(proba[0].shape, (4, 2))
    assert_equal(proba[1].shape, (4, 4))

    log_proba = clf.predict_log_proba(T)
    assert_equal(len(log_proba), 2)
    assert_equal(log_proba[0].shape, (4, 2))
    assert_equal(log_proba[1].shape, (4, 4))

    # toy regression problem
    clf = tree.DecisionTreeRegressor()
    y_hat = clf.fit(X, y).predict(T)
    assert_almost_equal(y_hat, y_true)
    assert_equal(y_hat.shape, (4, 2))


def test_sample_mask():
    """Test sample_mask argument. """
    # test list sample_mask
    clf = tree.DecisionTreeClassifier()
    sample_mask = [1] * len(X)
    clf.fit(X, y, sample_mask=sample_mask)
    assert_array_equal(clf.predict(T), true_result)

    # test different dtype
    clf = tree.DecisionTreeClassifier()
    sample_mask = np.ones((len(X),), dtype=np.int32)
    clf.fit(X, y, sample_mask=sample_mask)
    assert_array_equal(clf.predict(T), true_result)


def test_X_argsorted():
    """Test X_argsorted argument. """
    # test X_argsorted with different layout and dtype
    clf = tree.DecisionTreeClassifier()
    X_argsorted = np.argsort(np.array(X).T, axis=1).T
    clf.fit(X, y, X_argsorted=X_argsorted)
    assert_array_equal(clf.predict(T), true_result)


def test_classes_shape():
    """Test that n_classes_ and classes_ have proper shape."""
    # Classification, single output
    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y)

    assert_equal(clf.n_classes_, 2)
    assert_equal(clf.classes_, [-1, 1])

    # Classification, multi-output
    _y = np.vstack((y, np.array(y) * 2)).T
    clf = tree.DecisionTreeClassifier()
    clf.fit(X, _y)

    assert_equal(len(clf.n_classes_), 2)
    assert_equal(len(clf.classes_), 2)
    assert_equal(clf.n_classes_, [2, 2])
    assert_equal(clf.classes_, [[-1, 1], [-2, 2]])


def test_unbalanced_iris():
    """Check class rebalancing."""
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = balance_weights(unbalanced_y)

    clf = tree.DecisionTreeClassifier()
    clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
    assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)


def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)

    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 1000)

    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    from sklearn.utils.fixes import bincount
    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = tree.DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_equal(clf.tree_.threshold[internal],
                       clf2.tree_.threshold[internal])

    # Test negative weights
    X = iris.data
    y = iris.target

    sample_weight = -np.ones(X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(X.shape[0])
    sample_weight[0] = -1
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    # Check that predict_proba returns valid probabilities in the presence of
    # samples with negative weight
    X = iris.data
    y = iris.target

    sample_weight = rng.normal(.5, 1.0, X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)
    proba = clf.predict_proba(X)
    assert (proba >= 0).all() and (proba <= 1).all()


if __name__ == "__main__":
    import nose
    nose.runmodule()