Scikit-learn GLM tests for Glum
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import itertools
import warnings
from functools import partial

import numpy as np
import pytest
import scipy
from numpy.testing import assert_allclose
from scipy import linalg
from scipy.optimize import minimize, root
from sklearn._loss import (
    HalfBinomialLoss,
    HalfGammaLoss,
    HalfPoissonLoss,
    HalfTweedieLoss,
)
from sklearn._loss.link import IdentityLink, LogLink
from sklearn.base import clone
from sklearn.datasets import make_low_rank_matrix, make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import (
    GammaRegressor,
    PoissonRegressor,
    Ridge,
    TweedieRegressor,
)
from sklearn.linear_model._glm import _GeneralizedLinearRegressor
from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance
from sklearn.model_selection import train_test_split

from glum import GeneralizedLinearRegressor

SOLVERS = ["lbfgs", "irls-ls", "irls-cd"]


def _special_minimize(fun, grad, x, tol_NM, tol):
    # Find a good starting point with Nelder-Mead.
    res_NM = minimize(
        fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM}
    )
    # Now refine via root finding on the gradient of the function, which is
    # more precise than minimizing the function itself.
    res = root(
        grad,
        res_NM.x,
        method="lm",
        options={"ftol": tol, "xtol": tol, "gtol": tol},
    )
    return res.x
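

# The two-stage scheme above (a derivative-free Nelder-Mead pass followed by
# root finding on the gradient) is easiest to see on a toy problem. The helper
# below is a hypothetical, self-contained sketch, not part of the test suite,
# that applies the same idea to a simple quadratic whose exact minimizer is
# known from least squares.
def _two_stage_minimize_demo():
    """Hypothetical sketch: minimize f(w) = 0.5 * ||A w - b||^2 in two stages."""
    rng = np.random.RandomState(0)
    A = rng.normal(size=(20, 3))
    b = rng.normal(size=20)

    def fun(w):
        residual = A @ w - b
        return 0.5 * residual @ residual

    def grad(w):
        return A.T @ (A @ w - b)

    # Stage 1 (inside _special_minimize): coarse, derivative-free localization.
    # Stage 2: polish by solving grad(w) = 0, which is far more precise.
    w = _special_minimize(fun, grad, np.zeros(3), tol_NM=1e-6, tol=1e-14)
    # For this quadratic, the least-squares solution is the exact minimizer.
    w_exact = np.linalg.lstsq(A, b, rcond=None)[0]
    assert_allclose(w, w_exact, rtol=1e-8, atol=1e-10)
    return w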


@pytest.fixture
def global_random_seed():
    return 42


@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            GeneralizedLinearRegressor(family="binomial"),
            GeneralizedLinearRegressor(family="poisson"),
            GeneralizedLinearRegressor(family="gamma"),
            GeneralizedLinearRegressor(family="tweedie"),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
    ----------
    type : {"long", "wide"}
        If "long", then n_samples > n_features.
        If "wide", then n_features > n_samples.
    model : a GLM model

    For "wide", we return the minimum norm solution:

        min ||w||_2 subject to w = argmin deviance(X, y, w)

    Note that the deviance is always minimized if y = inverse_link(X w) is possible
    to achieve, which it is in the wide data case. Therefore, we can construct the
    solution with minimum norm like (wide) OLS:

        min ||w||_2 subject to link(y) = raw_prediction = X w

    Returns
    -------
    model : GLM model
    X : ndarray
        Last column of 1, i.e. intercept.
    y : ndarray
    coef_unpenalized : ndarray
        Minimum norm solution, i.e. min sum(loss(w)) (with minimum ||w||_2 in
        case of ambiguity).
        Last coefficient is intercept.
    coef_penalized : ndarray
        GLM solution with alpha=l2_reg_strength=1, i.e.
        min 1/n * sum(loss) + ||w[:-1]||_2^2.
        Last coefficient is intercept.
    l2_reg_strength : float
        Always equal to 1.
    """
    data_type, model = request.param
    # Make the larger dimension more than twice as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
    if data_type == "long":
        n_samples, n_features = 12, 4
    else:
        n_samples, n_features = 4, 12
    k = min(n_samples, n_features)
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=k,
        tail_strength=0.1,
        random_state=rng,
    )
    X[:, -1] = 1  # last column acts as intercept
    U, s, Vt = linalg.svd(X, full_matrices=False)
    assert np.all(s > 1e-3)  # to be sure
    assert np.max(s) / np.min(s) < 100  # condition number of X

    if data_type == "long":
        coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
        coef_unpenalized *= rng.choice([-1, 1], size=n_features)
        raw_prediction = X @ coef_unpenalized
    else:
        raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)
        # minimum norm solution min ||w||_2 such that raw_prediction = X w:
        # w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction
        # (see also the standalone sketch after this fixture)
        coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction

    if model.family == "binomial":
        loss = HalfBinomialLoss()
    elif model.family == "poisson":
        loss = HalfPoissonLoss()
    elif model.family == "gamma":
        loss = HalfGammaLoss()
    elif model.family == "tweedie":
        loss = HalfTweedieLoss(power=1.5)

    linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=True)
    sw = np.full(shape=n_samples, fill_value=1 / n_samples)
    y = linear_loss.base_loss.link.inverse(raw_prediction)

    # Add the penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve
    # with an optimizer. Note that the problem is well conditioned such that we get
    # accurate results.
    l2_reg_strength = 1
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_with_intercept = _special_minimize(
        fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14
    )

    linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=False)
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_without_intercept = _special_minimize(
        fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14
    )

    # To be sure
    assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm(
        coef_unpenalized
    )

    return (
        model,
        X,
        y,
        coef_unpenalized,
        coef_penalized_with_intercept,
        coef_penalized_without_intercept,
        l2_reg_strength,
    )
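

# For the wide case above, the unpenalized coefficients are built from the SVD
# as w = V s^(-1) U' raw_prediction, which is the minimum norm solution of
# raw_prediction = X w. The following hypothetical sketch (not used by the
# tests) checks, under a similar random construction, that this formula agrees
# with the Moore-Penrose pseudoinverse.
def _minimum_norm_demo():
    """Hypothetical sketch: SVD formula vs. np.linalg.pinv for min ||w||_2."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 4, 12  # wide, underdetermined system
    X = rng.normal(size=(n_samples, n_features))
    raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)

    U, s, Vt = linalg.svd(X, full_matrices=False)
    coef_svd = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction
    coef_pinv = np.linalg.pinv(X) @ raw_prediction

    # The SVD formula and the pseudoinverse give the same coefficients ...
    assert_allclose(coef_svd, coef_pinv)
    # ... and they reproduce raw_prediction exactly (up to float precision);
    # among all solutions of X w = raw_prediction, this one has minimal ||w||_2.
    assert_allclose(X @ coef_svd, raw_prediction, atol=1e-12)
    return coef_svd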


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_glm_regression(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    model.fit(X, y)

    rtol = 5e-5 if solver == "lbfgs" else 1e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)

    # Same with sample_weight.
    model = (
        clone(model).set_params(**params).fit(X, y, sample_weight=np.ones(X.shape[0]))
    )
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution on hstacked data.

    We work with a simple constructed data set with known solution.
    Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2.
    For long X, [X, X] is still a long but singular matrix.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    n_samples, n_features = X.shape
    params = dict(
        alpha=alpha / 2,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    X = 0.5 * np.concatenate((X, X), axis=1)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1)
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    with warnings.catch_warnings():
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.simplefilter("ignore", ConvergenceWarning)
        model.fit(X, y)

    rtol = 2e-4 if solver == "lbfgs" else 5e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution on vstacked data.

    We work with a simple constructed data set with known solution.
    Fit on [X] with alpha is the same as fit on the vertically stacked data
    [[X], [X]] and [[y], [y]] with the same alpha, because the average loss
    stays the same. (See the illustrative sketch after this test.)
    For wide X, [X', X'] is a singular matrix.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    n_samples, n_features = X.shape
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    X = np.concatenate((X, X), axis=0)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
    y = np.r_[y, y]
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    model.fit(X, y)

    rtol = 3e-5 if solver == "lbfgs" else 5e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)
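

# The vstacked test relies on the fact that duplicating every row leaves the
# *average* loss unchanged, so the penalty strength alpha does not need to be
# rescaled. Below is a hypothetical sketch of that argument (not used by the
# tests), using the mean Poisson deviance already imported above.
def _vstack_mean_loss_demo():
    """Hypothetical sketch: the mean loss is invariant under duplicating rows."""
    rng = np.random.RandomState(0)
    y_true = rng.uniform(low=0.5, high=3.0, size=10)
    y_pred = rng.uniform(low=0.5, high=3.0, size=10)
    # Stacking [y, y] doubles the summed deviance but not the mean deviance.
    single = mean_poisson_deviance(y_true, y_pred)
    stacked = mean_poisson_deviance(np.r_[y_true, y_true], np.r_[y_pred, y_pred])
    assert_allclose(single, stacked)
    return single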


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    Note: This checks the minimum norm solution for wide X, i.e.
    n_samples < n_features:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        X = X[:, :-1]  # remove intercept column
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        intercept = 0

    with warnings.catch_warnings():
        if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails
    # for the wide/fat case with n_features > n_samples. Most current GLM solvers do
    # NOT return the minimum norm solution with fit_intercept=True.
    if n_samples > n_features:
        rtol = 5e-5 if solver == "lbfgs" else 1e-7
        assert model.intercept_ == pytest.approx(intercept)
        assert_allclose(model.coef_, coef, rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 5e-5
        if solver == "newton-cholesky":
            rtol = 5e-4
        assert_allclose(model.predict(X), y, rtol=rtol)

        norm_solution = np.linalg.norm(np.r_[intercept, coef])
        norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
        if solver == "newton-cholesky":
            # XXX: This solver shows random behaviour. Sometimes it finds solutions
            # with norm_model <= norm_solution! So we check conditionally.
            if norm_model < (1 + 1e-12) * norm_solution:
                assert model.intercept_ == pytest.approx(intercept)
                assert_allclose(model.coef_, coef, rtol=rtol)
        elif solver == "lbfgs" and fit_intercept:
            # ... but it is not the minimum norm solution. Otherwise the norms
            # would be equal.
            assert norm_model > (1 + 1e-12) * norm_solution
            # See https://github.com/scikit-learn/scikit-learn/issues/23670.
            # Note: Even adding a tiny penalty does not give the minimal norm
            # solution.
            # XXX: We could have naively expected LBFGS to find the minimal norm
            # solution by adding a very small penalty. Even that fails for a reason
            # we do not properly understand at this point.
        else:
            # When `fit_intercept=False`, LBFGS naturally converges to the minimum
            # norm solution on this problem.
            # XXX: Do we have any theoretical guarantees why this should be the
            # case?
            assert model.intercept_ == pytest.approx(intercept, rel=rtol)
            assert_allclose(model.coef_, coef, rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    GLM fit on [X] is the same as fit on [X, X]/2.
    For long X, [X, X] is a singular matrix and we check against the minimum norm
    solution:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        intercept = coef[-1]
        coef = coef[:-1]
        if n_samples > n_features:
            X = X[:, :-1]  # remove intercept column
            X = 0.5 * np.concatenate((X, X), axis=1)
        else:
            # To know the minimum norm solution, we keep one intercept column and do
            # not divide by 2. Later on, we must take special care.
            X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]]
    else:
        intercept = 0
        X = 0.5 * np.concatenate((X, X), axis=1)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)

    with warnings.catch_warnings():
        if solver.startswith("newton"):
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    if fit_intercept and n_samples < n_features:
        # Here we take special care.
        model_intercept = 2 * model.intercept_
        model_coef = 2 * model.coef_[:-1]  # exclude the other intercept term
        # For the minimum norm solution, we would have
        # assert model.intercept_ == pytest.approx(model.coef_[-1])
    else:
        model_intercept = model.intercept_
        model_coef = model.coef_

    if n_samples > n_features:
        assert model_intercept == pytest.approx(intercept)
        rtol = 1e-4
        assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 1e-6 if solver == "lbfgs" else 5e-6
        assert_allclose(model.predict(X), y, rtol=rtol)
        if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky":
            # Same as in test_glm_regression_unpenalized.
            # But it is not the minimum norm solution. Otherwise the norms would be
            # equal.
            norm_solution = np.linalg.norm(
                0.5 * np.r_[intercept, intercept, coef, coef]
            )
            norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
            assert norm_model > (1 + 1e-12) * norm_solution
            # For the minimum norm solution, we would have
            # assert model.intercept_ == pytest.approx(model.coef_[-1])
        else:
            assert model_intercept == pytest.approx(intercept, rel=5e-6)
            assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    GLM fit on [X] is the same as fit on the vertically stacked data
    [[X], [X]] and [[y], [y]].
    For wide X, [X', X'] is a singular matrix and we check against the minimum norm
    solution:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        X = X[:, :-1]  # remove intercept column
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        intercept = 0
    X = np.concatenate((X, X), axis=0)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
    y = np.r_[y, y]

    with warnings.catch_warnings():
        if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    if n_samples > n_features:
        rtol = 5e-5 if solver == "lbfgs" else 1e-6
        assert model.intercept_ == pytest.approx(intercept)
        assert_allclose(model.coef_, coef, rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 1e-6 if solver == "lbfgs" else 5e-6
        assert_allclose(model.predict(X), y, rtol=rtol)

        norm_solution = np.linalg.norm(np.r_[intercept, coef])
        norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
        if solver == "newton-cholesky":
            # XXX: This solver shows random behaviour. Sometimes it finds solutions
            # with norm_model <= norm_solution! So we check conditionally.
            if not (norm_model > (1 + 1e-12) * norm_solution):
                assert model.intercept_ == pytest.approx(intercept)
                assert_allclose(model.coef_, coef, rtol=1e-4)
        elif solver == "lbfgs" and fit_intercept:
            # Same as in test_glm_regression_unpenalized.
            # But it is not the minimum norm solution. Otherwise the norms would be
            # equal.
            assert norm_model > (1 + 1e-12) * norm_solution
        else:
            rtol = 1e-5 if solver == "newton-cholesky" else 1e-4
            assert model.intercept_ == pytest.approx(intercept, rel=rtol)
            assert_allclose(model.coef_, coef, rtol=rtol)