# Author: Alexandre Gramfort
#         Fabian Pedregosa
#         Olivier Grisel
#         Gael Varoquaux
#
# License: BSD Style.

import sys
import warnings
import itertools
import operator
from abc import ABCMeta, abstractmethod

import numpy as np
from scipy import sparse

from .base import LinearModel
from ..base import RegressorMixin
from .base import sparse_center_data, center_data
from ..utils import array2d, atleast2d_or_csc, deprecated
from ..cross_validation import check_cv
from ..externals.joblib import Parallel, delayed
from ..utils.extmath import safe_sparse_dot

from . import cd_fast


###############################################################################
# ElasticNet model

class ElasticNet(LinearModel, RegressorMixin):
    """Linear Model trained with L1 and L2 prior as regularizer

    Minimizes the objective function::

            1 / (2 * n_samples) * ||y - Xw||^2_2 +
            + alpha * l1_ratio * ||w||_1
            + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2

    If you are interested in controlling the L1 and L2 penalty
    separately, keep in mind that this is equivalent to::

            a * L1 + b * L2

    where::

            alpha = a + b and l1_ratio = a / (a + b)

    The parameter l1_ratio corresponds to alpha in the glmnet R package while
    alpha corresponds to the lambda parameter in glmnet. Specifically,
    l1_ratio = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not
    reliable, unless you supply your own sequence of alpha.

    Parameters
    ----------
    alpha : float
        Constant that multiplies the penalty terms. Defaults to 1.0.
        See the notes for the exact mathematical meaning of this
        parameter. alpha = 0 is equivalent to an ordinary least squares,
        solved by the LinearRegression object in the scikit. For numerical
        reasons, using alpha = 0 with the Lasso object is not advised and
        you should prefer the LinearRegression object.

    l1_ratio : float
        The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For
        l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is
        an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination
        of L1 and L2.

    fit_intercept : bool
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    normalize : boolean, optional, default False
        If True, the regressors X will be normalized before regression.

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument. For sparse input
        this option is always True to preserve sparsity.

    max_iter : int, optional
        The maximum number of iterations

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than 'tol', the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than tol.

    warm_start : bool, optional
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.

    positive : bool, optional
        When set to True, forces the coefficients to be positive.

    Attributes
    ----------
    `coef_` : array, shape = (n_features,)
        parameter vector (w in the cost function formula)

    `sparse_coef_` : scipy.sparse matrix, shape = (n_features, 1)
        `sparse_coef_` is a readonly property derived from `coef_`

    `intercept_` : float | array, shape = (n_targets,)
        independent term in decision function.

    `dual_gap_` : float
        the current fit is guaranteed to be epsilon-suboptimal with
        epsilon := `dual_gap_`

    `eps_` : float
        `eps_` is used to check if the fit converged to the requested `tol`
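    Examples
    --------
    A minimal usage sketch (shown as a literal block rather than a doctest,
    since the exact fitted values are not reproduced here)::

        from sklearn.linear_model import ElasticNet

        clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
        clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
        print(clf.coef_)       # one weight per feature
        print(clf.intercept_)  # scalar for a single-target y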
    Notes
    -----
    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.
    """
    def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                 normalize=False, precompute='auto', max_iter=1000,
                 copy_X=True, tol=1e-4, warm_start=False, positive=False,
                 rho=None):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        if rho is not None:
            self.l1_ratio = rho
            warnings.warn("rho was renamed to l1_ratio and will be removed "
                          "in 0.15", DeprecationWarning)
        self.coef_ = None
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.precompute = precompute
        self.max_iter = max_iter
        self.copy_X = copy_X
        self.tol = tol
        self.warm_start = warm_start
        self.positive = positive
        self.intercept_ = 0.0

    def fit(self, X, y, Xy=None, coef_init=None):
        """Fit model with coordinate descent

        Parameters
        ----------
        X : ndarray or scipy.sparse matrix, (n_samples, n_features)
            Data

        y : ndarray, shape = (n_samples,) or (n_samples, n_targets)
            Target

        Xy : array-like, optional
            Xy = np.dot(X.T, y) that can be precomputed. It is useful
            only when the Gram matrix is precomputed.

        coef_init : ndarray of shape n_features or (n_targets, n_features)
            The initial coefficients to warm-start the optimization

        Notes
        -----
        Coordinate descent is an algorithm that considers each column of
        data at a time hence it will automatically convert the X input
        as a fortran contiguous numpy array if necessary.

        To avoid memory re-allocation it is advised to allocate the
        initial data in memory directly using that format.
        """
        if self.alpha == 0:
            warnings.warn("With alpha=0, this algorithm does not converge "
                          "well. You are advised to use the LinearRegression "
                          "estimator", stacklevel=2)
        X = atleast2d_or_csc(X, dtype=np.float64, order='F',
                             copy=self.copy_X and self.fit_intercept)
        # From now on X can be touched inplace
        y = np.asarray(y, dtype=np.float64)
        # now all computation with X can be done inplace
        fit = self._sparse_fit if sparse.isspmatrix(X) else self._dense_fit
        fit(X, y, Xy, coef_init)
        return self

    def _dense_fit(self, X, y, Xy=None, coef_init=None):
        # copy was done in fit if necessary
        X, y, X_mean, y_mean, X_std = center_data(
            X, y, self.fit_intercept, self.normalize, copy=False)

        if y.ndim == 1:
            y = y[:, np.newaxis]
        if Xy is not None and Xy.ndim == 1:
            Xy = Xy[:, np.newaxis]

        n_samples, n_features = X.shape
        n_targets = y.shape[1]

        precompute = self.precompute
        if hasattr(precompute, '__array__') \
                and not np.allclose(X_mean, np.zeros(n_features)) \
                and not np.allclose(X_std, np.ones(n_features)):
            # recompute the Gram matrix: the precomputed one does not
            # account for the centering / scaling applied above
            precompute = 'auto'
            Xy = None

        coef_ = self._init_coef(coef_init, n_features, n_targets)
        dual_gap_ = np.empty(n_targets)
        eps_ = np.empty(n_targets)

        l1_reg = self.alpha * self.l1_ratio * n_samples
        l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples

        # precompute the Gram matrix if explicitly requested, or if
        # 'auto' and n_samples > n_features
        if hasattr(precompute, '__array__'):
            Gram = precompute
        elif precompute is True or (precompute == 'auto'
                                    and n_samples > n_features):
            Gram = np.dot(X.T, X)
        else:
            Gram = None

        for k in xrange(n_targets):
            if Gram is None:
                coef_[k, :], dual_gap_[k], eps_[k] = \
                    cd_fast.enet_coordinate_descent(
                        coef_[k, :], l1_reg, l2_reg, X, y[:, k],
                        self.max_iter, self.tol, self.positive)
            else:
                Gram = Gram.copy()
                if Xy is None:
                    this_Xy = np.dot(X.T, y[:, k])
                else:
                    this_Xy = Xy[:, k]
                coef_[k, :], dual_gap_[k], eps_[k] = \
                    cd_fast.enet_coordinate_descent_gram(
                        coef_[k, :], l1_reg, l2_reg, Gram, this_Xy, y[:, k],
                        self.max_iter, self.tol, self.positive)

            if dual_gap_[k] > eps_[k]:
                warnings.warn('Objective did not converge for '
                              'target %d, you might want '
                              'to increase the number of iterations' % k)

        self.coef_, self.dual_gap_, self.eps_ = (np.squeeze(a) for a in
                                                 (coef_, dual_gap_, eps_))
        self._set_intercept(X_mean, y_mean, X_std)

        # return self for chaining fit and predict calls
        return self

    def _sparse_fit(self, X, y, Xy=None, coef_init=None):

        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have incompatible shapes.\n"
                             "Note: Sparse matrices cannot be indexed w/"
                             "boolean masks (use `indices=True` in CV).")

        # NOTE: we are explicitly not centering the data the naive way to
        # avoid breaking the sparsity of X
        X_data, y, X_mean, y_mean, X_std = sparse_center_data(
            X, y, self.fit_intercept, self.normalize)

        if y.ndim == 1:
            y = y[:, np.newaxis]

        n_samples, n_features = X.shape[0], X.shape[1]
        n_targets = y.shape[1]

        coef_ = self._init_coef(coef_init, n_features, n_targets)
        dual_gap_ = np.empty(n_targets)
        eps_ = np.empty(n_targets)

        l1_reg = self.alpha * self.l1_ratio * n_samples
        l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples

        for k in xrange(n_targets):
            coef_[k, :], dual_gap_[k], eps_[k] = \
                cd_fast.sparse_enet_coordinate_descent(
                    coef_[k, :], l1_reg, l2_reg, X_data, X.indices,
                    X.indptr, y[:, k], X_mean / X_std,
                    self.max_iter, self.tol, self.positive)

            if dual_gap_[k] > eps_[k]:
                warnings.warn('Objective did not converge for '
                              'target %d, you might want '
                              'to increase the number of iterations' % k)

        self.coef_, self.dual_gap_, self.eps_ = (np.squeeze(a) for a in
                                                 (coef_, dual_gap_, eps_))
        self._set_intercept(X_mean, y_mean, X_std)

        # return self for chaining fit and predict calls
        return self
    def _init_coef(self, coef_init, n_features, n_targets):
        if coef_init is None:
            if not self.warm_start or self.coef_ is None:
                coef_ = np.zeros((n_targets, n_features), dtype=np.float64)
            else:
                coef_ = self.coef_
        else:
            coef_ = coef_init

        if coef_.ndim == 1:
            coef_ = coef_[np.newaxis, :]
        if coef_.shape != (n_targets, n_features):
            raise ValueError("X and coef_init have incompatible "
                             "shapes (%s != %s)."
                             % (coef_.shape, (n_targets, n_features)))

        return coef_

    @property
    def sparse_coef_(self):
        """ sparse representation of the fitted coef """
        return sparse.csr_matrix(self.coef_)

    def decision_function(self, X):
        """Decision function of the linear model

        Parameters
        ----------
        X : numpy array or scipy.sparse matrix of shape (n_samples, n_features)

        Returns
        -------
        T : array, shape = (n_samples,)
            The predicted decision function
        """
        if sparse.isspmatrix(X):
            return np.ravel(safe_sparse_dot(self.coef_, X.T,
                                            dense_output=True)
                            + self.intercept_)
        else:
            return super(ElasticNet, self).decision_function(X)


###############################################################################
# Lasso model

class Lasso(ElasticNet):
    """Linear Model trained with L1 prior as regularizer (aka the Lasso)

    The optimization objective for Lasso is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    Technically the Lasso model is optimizing the same objective function as
    the Elastic Net with l1_ratio=1.0 (no L2 penalty).

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the L1 term. Defaults to 1.0.
        alpha = 0 is equivalent to an ordinary least squares, solved
        by the LinearRegression object in the scikit. For numerical
        reasons, using alpha = 0 with the Lasso object is not advised
        and you should prefer the LinearRegression object.

    fit_intercept : boolean
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    normalize : boolean, optional, default False
        If True, the regressors X will be normalized before regression.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument. For sparse input
        this option is always True to preserve sparsity.

    max_iter : int, optional
        The maximum number of iterations

    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than 'tol', the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than tol.

    warm_start : bool, optional
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.

    positive : bool, optional
        When set to True, forces the coefficients to be positive.

    Attributes
    ----------
    `coef_` : array, shape = (n_features,)
        parameter vector (w in the cost function formula)

    `sparse_coef_` : scipy.sparse matrix, shape = (n_features, 1)
        `sparse_coef_` is a readonly property derived from `coef_`

    `intercept_` : float
        independent term in decision function.

    `dual_gap_` : float
        the current fit is guaranteed to be epsilon-suboptimal with
        epsilon := `dual_gap_`

    `eps_` : float
        `eps_` is used to check if the fit converged to the requested `tol`
    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.Lasso(alpha=0.1)
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
       normalize=False, positive=False, precompute='auto', tol=0.0001,
       warm_start=False)
    >>> print(clf.coef_)
    [ 0.85  0.  ]
    >>> print(clf.intercept_)
    0.15

    See also
    --------
    lars_path
    lasso_path
    LassoLars
    LassoCV
    LassoLarsCV
    sklearn.decomposition.sparse_encode

    Notes
    -----
    The algorithm used to fit the model is coordinate descent.

    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.
    """
    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 precompute='auto', copy_X=True, max_iter=1000,
                 tol=1e-4, warm_start=False, positive=False):
        super(Lasso, self).__init__(
            alpha=alpha, l1_ratio=1.0, fit_intercept=fit_intercept,
            normalize=normalize, precompute=precompute, copy_X=copy_X,
            max_iter=max_iter, tol=tol, warm_start=warm_start,
            positive=positive)


###############################################################################
# Classes to store linear models along a regularization path

def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, precompute='auto',
               Xy=None, fit_intercept=True, normalize=False, copy_X=True,
               verbose=False, **params):
    """Compute Lasso path with coordinate descent

    The optimization objective for Lasso is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    Parameters
    ----------
    X : ndarray, shape = (n_samples, n_features)
        Training data. Pass directly as fortran contiguous data to avoid
        unnecessary memory duplication

    y : ndarray, shape = (n_samples,)
        Target values

    eps : float, optional
        Length of the path. eps=1e-3 means that
        alpha_min / alpha_max = 1e-3

    n_alphas : int, optional
        Number of alphas along the regularization path

    alphas : ndarray, optional
        List of alphas where to compute the models.
        If None alphas are set automatically

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument.

    Xy : array-like, optional
        Xy = np.dot(X.T, y) that can be precomputed. It is useful
        only when the Gram matrix is precomputed.

    fit_intercept : bool
        Whether to fit an intercept

    normalize : boolean, optional, default False
        If True, the regressors X will be normalized before regression.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool or integer
        Amount of verbosity

    params : kwargs
        keyword arguments passed to the Lasso objects

    Returns
    -------
    models : a list of models along the regularization path

    Notes
    -----
    See examples/linear_model/plot_lasso_coordinate_descent_path.py
    for an example.

    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.

    Note that in certain cases, the Lars solver may be significantly
    faster at computing this path.
    In particular, linear interpolation can be used to retrieve model
    coefficients between the values output by lars_path

    Examples
    --------
    Comparing lasso_path and lars_path with interpolation:

    >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T
    >>> y = np.array([1, 2, 3.1])
    >>> # Use lasso_path to compute a coefficient path
    >>> coef_path = [e.coef_ for e in lasso_path(X, y, alphas=[5., 1., .5],
    ...                                          fit_intercept=False)]
    >>> print np.array(coef_path).T
    [[ 0.          0.          0.46874778]
     [ 0.2159048   0.4425765   0.23689075]]

    >>> # Now use lars_path and 1D linear interpolation to compute the
    >>> # same path
    >>> from sklearn.linear_model import lars_path
    >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso')
    >>> from scipy import interpolate
    >>> coef_path_continuous = interpolate.interp1d(alphas[::-1],
    ...                                             coef_path_lars[:, ::-1])
    >>> print coef_path_continuous([5., 1., .5])
    [[ 0.          0.          0.46915237]
     [ 0.2159048   0.4425765   0.23668876]]

    See also
    --------
    lars_path
    Lasso
    LassoLars
    LassoCV
    LassoLarsCV
    sklearn.decomposition.sparse_encode
    """
    return enet_path(X, y, l1_ratio=1., eps=eps, n_alphas=n_alphas,
                     alphas=alphas, precompute=precompute, Xy=Xy,
                     fit_intercept=fit_intercept, normalize=normalize,
                     copy_X=copy_X, verbose=verbose, **params)


def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
              precompute='auto', Xy=None, fit_intercept=True,
              normalize=False, copy_X=True, verbose=False, rho=None,
              **params):
    """Compute Elastic-Net path with coordinate descent

    The Elastic Net optimization function is::

        1 / (2 * n_samples) * ||y - Xw||^2_2 +
        + alpha * l1_ratio * ||w||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2

    Parameters
    ----------
    X : ndarray, shape = (n_samples, n_features)
        Training data. Pass directly as fortran contiguous data to avoid
        unnecessary memory duplication

    y : ndarray, shape = (n_samples,)
        Target values

    l1_ratio : float, optional
        float between 0 and 1 passed to ElasticNet (scaling between
        l1 and l2 penalties). l1_ratio=1 corresponds to the Lasso

    eps : float
        Length of the path. eps=1e-3 means that
        alpha_min / alpha_max = 1e-3

    n_alphas : int, optional
        Number of alphas along the regularization path

    alphas : ndarray, optional
        List of alphas where to compute the models.
        If None alphas are set automatically

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument.

    Xy : array-like, optional
        Xy = np.dot(X.T, y) that can be precomputed. It is useful
        only when the Gram matrix is precomputed.

    fit_intercept : bool
        Whether to fit an intercept

    normalize : boolean, optional, default False
        If True, the regressors X will be normalized before regression.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool or integer
        Amount of verbosity

    params : kwargs
        keyword arguments passed to the ElasticNet objects

    Returns
    -------
    models : a list of models along the regularization path

    Notes
    -----
    See examples/plot_lasso_coordinate_descent_path.py for an example.

    See also
    --------
    ElasticNet
    ElasticNetCV
    """
    if rho is not None:
        l1_ratio = rho
        warnings.warn("rho was renamed to l1_ratio and will be removed "
                      "in 0.15", DeprecationWarning)

    X = atleast2d_or_csc(X, dtype=np.float64, order='F',
                         copy=copy_X and fit_intercept)
    # From now on X can be touched inplace
    if not sparse.isspmatrix(X):
        X, y, X_mean, y_mean, X_std = center_data(X, y, fit_intercept,
                                                  normalize, copy=False)
    # XXX : in the sparse case the data will be centered
    # at each fit...
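    # Below we decide whether to precompute the Gram matrix X^T X, then build
    # a decreasing grid of alphas: alpha_max = max|X^T y| / (n_samples *
    # l1_ratio) is the smallest alpha for which all coefficients are zero, and
    # the grid spans [alpha_max * eps, alpha_max] on a log scale.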
    n_samples, n_features = X.shape

    if (hasattr(precompute, '__array__')
            # X_mean / X_std only exist for dense X (see centering above)
            and not sparse.isspmatrix(X)
            and not np.allclose(X_mean, np.zeros(n_features))
            and not np.allclose(X_std, np.ones(n_features))):
        # recompute the Gram matrix: the precomputed one does not account
        # for the centering / scaling applied above
        precompute = 'auto'
        Xy = None

    if precompute is True or ((precompute == 'auto') and
                              (n_samples > n_features)):
        if sparse.isspmatrix(X):
            warnings.warn("precompute is ignored for sparse data")
            precompute = False
        else:
            precompute = np.dot(X.T, X)

    if Xy is None:
        Xy = safe_sparse_dot(X.T, y, dense_output=True)

    n_samples = X.shape[0]
    if alphas is None:
        alpha_max = np.abs(Xy).max() / (n_samples * l1_ratio)
        alphas = np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max),
                             num=n_alphas)[::-1]
    else:
        # make sure alphas are properly ordered
        alphas = np.sort(alphas)[::-1]

    coef_ = None  # init coef_
    models = []

    n_alphas = len(alphas)
    for i, alpha in enumerate(alphas):
        model = ElasticNet(
            alpha=alpha, l1_ratio=l1_ratio,
            fit_intercept=fit_intercept if sparse.isspmatrix(X) else False,
            precompute=precompute)
        model.set_params(**params)
        model.fit(X, y, coef_init=coef_, Xy=Xy)
        if fit_intercept and not sparse.isspmatrix(X):
            model.fit_intercept = True
            model._set_intercept(X_mean, y_mean, X_std)
        if verbose:
            if verbose > 2:
                print model
            elif verbose > 1:
                print 'Path: %03i out of %03i' % (i, n_alphas)
            else:
                sys.stderr.write('.')
        coef_ = model.coef_.copy()
        models.append(model)
    return models


def _path_residuals(X, y, train, test, path, path_params, l1_ratio=1):
    if 'l1_ratio' in path_params:
        path_params['l1_ratio'] = l1_ratio
    models_train = path(X[train], y[train], **path_params)
    this_mses = np.empty(len(models_train))
    for i_model, model in enumerate(models_train):
        y_ = model.predict(X[test])
        this_mses[i_model] = ((y_ - y[test]) ** 2).mean()
    return this_mses, l1_ratio
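# The CV estimators below fit one regularization path per (l1_ratio, fold)
# pair, in parallel, and keep the (l1_ratio, alpha) combination with the
# lowest mean test-set MSE across folds.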
class LinearModelCV(LinearModel):
    """Base class for iterative model fitting along a regularization path"""

    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self, eps=1e-3, n_alphas=100, alphas=None,
                 fit_intercept=True, normalize=False, precompute='auto',
                 max_iter=1000, tol=1e-4, copy_X=True, cv=None,
                 verbose=False):
        self.eps = eps
        self.n_alphas = n_alphas
        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.precompute = precompute
        self.max_iter = max_iter
        self.tol = tol
        self.copy_X = copy_X
        self.cv = cv
        self.verbose = verbose

    def fit(self, X, y):
        """Fit linear model with coordinate descent

        Fit is on grid of alphas and best alpha estimated by cross-validation.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data. Pass directly as fortran contiguous data to avoid
            unnecessary memory duplication

        y : ndarray, shape (n_samples,) or (n_samples, n_targets)
            Target values
        """
        X = atleast2d_or_csc(X, dtype=np.float64, order='F',
                             copy=self.copy_X and self.fit_intercept)
        # From now on X can be touched inplace
        y = np.asarray(y, dtype=np.float64)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have inconsistent dimensions (%d != %d)"
                             % (X.shape[0], y.shape[0]))

        # All LinearModelCV parameters except 'cv' are acceptable
        path_params = self.get_params()
        if 'l1_ratio' in path_params:
            l1_ratios = np.atleast_1d(path_params['l1_ratio'])
            # For the first path, we need to set l1_ratio
            path_params['l1_ratio'] = l1_ratios[0]
        else:
            l1_ratios = [1, ]
        path_params.pop('cv', None)
        path_params.pop('n_jobs', None)

        # Start to compute path on full data
        # XXX: is this really useful: we are fitting models that we won't
        # use later
        models = self.path(X, y, **path_params)

        # Update the alphas list
        alphas = [model.alpha for model in models]
        n_alphas = len(alphas)
        path_params.update({'alphas': alphas, 'n_alphas': n_alphas})

        # init cross-validation generator
        cv = check_cv(self.cv, X)

        # Compute path for all folds and compute MSE to get the best alpha
        folds = list(cv)
        best_mse = np.inf
        all_mse_paths = list()

        # We do a double for loop folded in one, in order to be able to
        # iterate in parallel on l1_ratio and folds
        for l1_ratio, mse_alphas in itertools.groupby(
                Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                    delayed(_path_residuals)(
                        X, y, train, test, self.path, path_params,
                        l1_ratio=l1_ratio)
                    for l1_ratio in l1_ratios
                    for train, test in folds
                ), operator.itemgetter(1)):

            mse_alphas = [m[0] for m in mse_alphas]
            mse_alphas = np.array(mse_alphas)
            mse = np.mean(mse_alphas, axis=0)
            i_best_alpha = np.argmin(mse)
            this_best_mse = mse[i_best_alpha]
            all_mse_paths.append(mse_alphas.T)
            if this_best_mse < best_mse:
                model = models[i_best_alpha]
                best_l1_ratio = l1_ratio
                # keep track of the best score seen so far, so that the
                # comparison above selects the overall best l1_ratio
                best_mse = this_best_mse

        if hasattr(model, 'l1_ratio'):
            if model.l1_ratio != best_l1_ratio:
                # Need to refit the model
                model.l1_ratio = best_l1_ratio
                model.fit(X, y)
            self.l1_ratio_ = model.l1_ratio

        self.coef_ = model.coef_
        self.intercept_ = model.intercept_
        self.alpha_ = model.alpha
        self.alphas_ = np.asarray(alphas)
        self.coef_path_ = np.asarray([model.coef_ for model in models])
        self.mse_path_ = np.squeeze(all_mse_paths)
        return self

    @property
    def rho_(self):
        warnings.warn("rho was renamed to ``l1_ratio_`` and will be removed "
                      "in 0.15", DeprecationWarning)
        return self.l1_ratio_

    @property
    def alpha(self):
        warnings.warn("Use alpha_. Using alpha is deprecated "
                      "since version 0.12, and backward compatibility "
                      "won't be maintained from version 0.14 onward. ",
                      DeprecationWarning, stacklevel=1)
        return self.alpha_


class LassoCV(LinearModelCV, RegressorMixin):
    """Lasso linear model with iterative fitting along a regularization path

    The best model is selected by cross-validation.

    The optimization objective for Lasso is::

        (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    Parameters
    ----------
    eps : float, optional
        Length of the path. eps=1e-3 means that
        alpha_min / alpha_max = 1e-3.

    n_alphas : int, optional
        Number of alphas along the regularization path

    alphas : numpy array, optional
        List of alphas where to compute the models.
        If None alphas are set automatically

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument.
    max_iter : int, optional
        The maximum number of iterations

    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than 'tol', the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than tol.

    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 3).
        Specific cross-validation objects can be passed, see the
        sklearn.cross_validation module for the list of possible objects

    verbose : bool or integer
        amount of verbosity

    Attributes
    ----------
    `alpha_` : float
        The amount of penalization chosen by cross validation

    `coef_` : array, shape = (n_features,)
        parameter vector (w in the cost function formula)

    `intercept_` : float
        independent term in decision function.

    `mse_path_` : array, shape = (n_alphas, n_folds)
        mean square error for the test set on each fold, varying alpha

    `alphas_` : numpy array
        The grid of alphas used for fitting
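    Examples
    --------
    A minimal usage sketch (shown as a literal block; the selected alpha
    depends on the random data and is therefore not reproduced)::

        import numpy as np
        from sklearn.linear_model import LassoCV

        X = np.random.randn(50, 10)
        y = X[:, 0] + 0.1 * np.random.randn(50)
        clf = LassoCV(cv=3).fit(X, y)
        print(clf.alpha_)           # penalty chosen by cross-validation
        print(clf.mse_path_.shape)  # (n_alphas, n_folds)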
    Notes
    -----
    See examples/linear_model/lasso_path_with_crossvalidation.py
    for an example.

    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.

    See also
    --------
    lars_path
    lasso_path
    LassoLars
    Lasso
    LassoLarsCV
    """
    path = staticmethod(lasso_path)
    n_jobs = 1

    def __init__(self, eps=1e-3, n_alphas=100, alphas=None,
                 fit_intercept=True, normalize=False, precompute='auto',
                 max_iter=1000, tol=1e-4, copy_X=True, cv=None,
                 verbose=False):
        super(LassoCV, self).__init__(
            eps=eps, n_alphas=n_alphas, alphas=alphas,
            fit_intercept=fit_intercept, normalize=normalize,
            precompute=precompute, max_iter=max_iter, tol=tol,
            copy_X=copy_X, cv=cv, verbose=verbose)


class ElasticNetCV(LinearModelCV, RegressorMixin):
    """Elastic Net model with iterative fitting along a regularization path

    The best model is selected by cross-validation.

    Parameters
    ----------
    l1_ratio : float, optional
        float between 0 and 1 passed to ElasticNet (scaling between
        l1 and l2 penalties). For l1_ratio = 0 the penalty is an
        L2 penalty. For l1_ratio = 1 it is an L1 penalty. For
        0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
        This parameter can be a list, in which case the different
        values are tested by cross-validation and the one giving the best
        prediction score is used. Note that a good choice of list of
        values for l1_ratio is often to put more values close to 1
        (i.e. Lasso) and less close to 0 (i.e. Ridge), as in
        [.1, .5, .7, .9, .95, .99, 1]

    eps : float, optional
        Length of the path. eps=1e-3 means that
        alpha_min / alpha_max = 1e-3.

    n_alphas : int, optional
        Number of alphas along the regularization path

    alphas : numpy array, optional
        List of alphas where to compute the models.
        If None alphas are set automatically

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up
        calculations. If set to 'auto' let us decide. The Gram
        matrix can also be passed as argument.

    max_iter : int, optional
        The maximum number of iterations

    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than 'tol', the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than tol.

    cv : integer or cross-validation generator, optional
        If an integer is passed, it is the number of folds (default 3).
        Specific cross-validation objects can be passed, see the
        sklearn.cross_validation module for the list of possible objects

    verbose : bool or integer
        amount of verbosity

    n_jobs : integer, optional
        Number of CPUs to use during the cross validation. If '-1', use
        all the CPUs. Note that this is used only if multiple values for
        l1_ratio are given.

    Attributes
    ----------
    `alpha_` : float
        The amount of penalization chosen by cross validation

    `l1_ratio_` : float
        The compromise between l1 and l2 penalization chosen by
        cross validation

    `coef_` : array, shape = (n_features,)
        Parameter vector (w in the cost function formula),

    `intercept_` : float
        Independent term in the decision function.

    `mse_path_` : array, shape = (n_l1_ratio, n_alpha, n_folds)
        Mean square error for the test set on each fold, varying l1_ratio and
        alpha.
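    Examples
    --------
    A minimal usage sketch (illustrative only; outputs depend on the random
    data and are not shown)::

        import numpy as np
        from sklearn.linear_model import ElasticNetCV

        X = np.random.randn(60, 8)
        y = X[:, 0] - 2 * X[:, 1] + 0.1 * np.random.randn(60)
        # several l1_ratio values are cross-validated jointly with alpha
        clf = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=3)
        clf.fit(X, y)
        print(clf.l1_ratio_, clf.alpha_)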
    Notes
    -----
    See examples/linear_model/lasso_path_with_crossvalidation.py
    for an example.

    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.

    The parameter l1_ratio corresponds to alpha in the glmnet R package
    while alpha corresponds to the lambda parameter in glmnet.
    More specifically, the optimization objective is::

        1 / (2 * n_samples) * ||y - Xw||^2_2 +
        + alpha * l1_ratio * ||w||_1
        + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2

    If you are interested in controlling the L1 and L2 penalty
    separately, keep in mind that this is equivalent to::

        a * L1 + b * L2

    for::

        alpha = a + b and l1_ratio = a / (a + b).

    See also
    --------
    enet_path
    ElasticNet
    """
    path = staticmethod(enet_path)

    def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                 fit_intercept=True, normalize=False, precompute='auto',
                 max_iter=1000, tol=1e-4, cv=None, copy_X=True,
                 verbose=0, n_jobs=1, rho=None):
        self.l1_ratio = l1_ratio
        if rho is not None:
            self.l1_ratio = rho
            warnings.warn("rho was renamed to l1_ratio and will be removed "
                          "in 0.15", DeprecationWarning)
        self.eps = eps
        self.n_alphas = n_alphas
        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.precompute = precompute
        self.max_iter = max_iter
        self.tol = tol
        self.cv = cv
        self.copy_X = copy_X
        self.verbose = verbose
        self.n_jobs = n_jobs

    @property
    @deprecated("rho was renamed to ``l1_ratio_`` and will be removed "
                "in 0.15")
    def rho(self):
        return self.l1_ratio_


###############################################################################
# Multi Task ElasticNet and Lasso models (with joint feature selection)

class MultiTaskElasticNet(Lasso):
    """Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer

    The optimization objective for MultiTaskElasticNet is::

        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro
        + alpha * l1_ratio * ||W||_21
        + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2

    Where::

        ||W||_21 = \sum_i \sqrt{\sum_j w_{ij}^2}

    i.e. the sum of norms of each row.

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the L1/L2 term. Defaults to 1.0

    l1_ratio : float
        The ElasticNet mixing parameter, with 0 < l1_ratio <= 1.
        For l1_ratio = 0 the penalty is an L2 penalty.
        For l1_ratio = 1 it is an L1/L2 penalty.
        For 0 < l1_ratio < 1, the penalty is a combination of L1/L2 and L2.

    fit_intercept : boolean
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    normalize : boolean, optional, default False
        If True, the regressors X will be normalized before regression.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, optional
        The maximum number of iterations

    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than 'tol', the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than tol.

    warm_start : bool, optional
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.

    Attributes
    ----------
    `intercept_` : array, shape = (n_tasks,)
        Independent term in decision function.

    `coef_` : array, shape = (n_tasks, n_features)
        Parameter vector (W in the cost function formula). If a 1D y is
        passed in at fit (non multi-task usage), `coef_` is then a 1D array

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1)
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])
    ... #doctest: +NORMALIZE_WHITESPACE
    MultiTaskElasticNet(alpha=0.1, copy_X=True, fit_intercept=True,
            l1_ratio=0.5, max_iter=1000, normalize=False, rho=None,
            tol=0.0001, warm_start=False)
    >>> print clf.coef_
    [[ 0.45663524  0.45612256]
     [ 0.45663524  0.45612256]]
    >>> print clf.intercept_
    [ 0.0872422  0.0872422]

    See also
    --------
    ElasticNet, MultiTaskLasso

    Notes
    -----
    The algorithm used to fit the model is coordinate descent.

    To avoid unnecessary memory duplication the X argument of the fit method
    should be directly passed as a fortran contiguous numpy array.
    """
    def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                 normalize=False, copy_X=True, max_iter=1000, tol=1e-4,
                 warm_start=False, rho=None):
        self.l1_ratio = l1_ratio
        if rho is not None:
            self.l1_ratio = rho
            warnings.warn("rho was renamed to l1_ratio and will be removed "
                          "in 0.15", DeprecationWarning)
        self.alpha = alpha
        self.coef_ = None
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.max_iter = max_iter
        self.copy_X = copy_X
        self.tol = tol
        self.warm_start = warm_start

    def fit(self, X, y, Xy=None, coef_init=None):
        """Fit MultiTaskElasticNet model with coordinate descent

        Parameters
        ----------
        X : ndarray, shape = (n_samples, n_features)
            Data

        y : ndarray, shape = (n_samples, n_tasks)
            Target

        coef_init : ndarray of shape n_features
            The initial coefficients to warm-start the optimization

        Notes
        -----
        Coordinate descent is an algorithm that considers each column of
        data at a time hence it will automatically convert the X input
        as a fortran contiguous numpy array if necessary.

        To avoid memory re-allocation it is advised to allocate the
        initial data in memory directly using that format.
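        For example, to allocate Fortran-ordered input up front (an
        illustrative sketch, not a requirement of the API)::

            import numpy as np
            X = np.asfortranarray(np.random.randn(20, 5))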
""" # X and y must be of type float64 X = array2d(X, dtype=np.float64, order='F', copy=self.copy_X and self.fit_intercept) y = np.asarray(y, dtype=np.float64) squeeze_me = False if y.ndim == 1: squeeze_me = True y = y[:, np.newaxis] n_samples, n_features = X.shape _, n_tasks = y.shape X, y, X_mean, y_mean, X_std = center_data( X, y, self.fit_intercept, self.normalize, copy=False) if coef_init is None: if not self.warm_start or self.coef_ is None: self.coef_ = np.zeros((n_tasks, n_features), dtype=np.float64, order='F') else: self.coef_ = coef_init l1_reg = self.alpha * self.l1_ratio * n_samples l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples self.coef_ = np.asfortranarray(self.coef_) # coef contiguous in memory self.coef_, self.dual_gap_, self.eps_ = \ cd_fast.enet_coordinate_descent_multi_task( self.coef_, l1_reg, l2_reg, X, y, self.max_iter, self.tol) self._set_intercept(X_mean, y_mean, X_std) # Make sure that the coef_ have the same shape as the given 'y', # to predict with the same shape if squeeze_me: self.coef_ = self.coef_.squeeze() if self.dual_gap_ > self.eps_: warnings.warn('Objective did not converge, you might want' ' to increase the number of iterations') # return self for chaining fit and predict calls return self class MultiTaskLasso(MultiTaskElasticNet): """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 Where:: ||W||_21 = \sum_i \sqrt{\sum_j w_{ij}^2} i.e. the sum of norm of earch row. Parameters ---------- alpha : float, optional Constant that multiplies the L1/L2 term. Defaults to 1.0 fit_intercept : boolean whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : boolean, optional, default False If True, the regressors X will be normalized before regression. copy_X : boolean, optional, default True If True, X will be copied; else, it may be overwritten. max_iter : int, optional The maximum number of iterations tol : float, optional The tolerance for the optimization: if the updates are smaller than 'tol', the optimization code checks the dual gap for optimality and continues until it is smaller than tol. warm_start : bool, optional When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Attributes ---------- `coef_` : array, shape = (n_tasks, n_features) parameter vector (W in the cost function formula) `intercept_` : array, shape = (n_tasks,) independent term in decision function. Examples -------- >>> from sklearn import linear_model >>> clf = linear_model.MultiTaskLasso(alpha=0.1) >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]) MultiTaskLasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, tol=0.0001, warm_start=False) >>> print clf.coef_ [[ 0.89393398 0. ] [ 0.89393398 0. ]] >>> print clf.intercept_ [ 0.10606602 0.10606602] See also -------- Lasso, MultiTaskElasticNet Notes ----- The algorithm used to fit the model is coordinate descent. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a fortran contiguous numpy array. 
""" def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False): self.alpha = alpha self.coef_ = None self.fit_intercept = fit_intercept self.normalize = normalize self.max_iter = max_iter self.copy_X = copy_X self.tol = tol self.warm_start = warm_start self.l1_ratio = 1.0