Scikit-learn GLM tests for Glum
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import itertools
import warnings
from functools import partial
import numpy as np
import pytest
import scipy
from numpy.testing import assert_allclose
from scipy import linalg
from scipy.optimize import minimize, root
from sklearn._loss import HalfBinomialLoss, HalfGammaLoss, HalfPoissonLoss, HalfTweedieLoss
from sklearn._loss.link import IdentityLink, LogLink
from sklearn.base import clone
from sklearn.datasets import make_low_rank_matrix, make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import (
GammaRegressor,
PoissonRegressor,
Ridge,
TweedieRegressor,
)
from sklearn.linear_model._glm import _GeneralizedLinearRegressor
from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance
from sklearn.model_selection import train_test_split
from glum import GeneralizedLinearRegressor
SOLVERS = ["lbfgs", "irls-ls", "irls-cd"]
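# Note: these are glum's solver names. The "newton"/"newton-cholesky" branches
# in the tests below are kept verbatim from the upstream scikit-learn test
# suite; they are never taken for the solvers listed here.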
def _special_minimize(fun, grad, x, tol_NM, tol):
# Find good starting point by Nelder-Mead
res_NM = minimize(
fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM}
)
# Now refine via root finding on the gradient of the function, which is
# more precise than minimizing the function itself.
res = root(
grad,
res_NM.x,
method="lm",
options={"ftol": tol, "xtol": tol, "gtol": tol},
)
return res.x
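# A minimal usage sketch of _special_minimize (hypothetical, not part of the
# tests): refine the minimum of f(x) = ||x - 1||_2^2 with gradient 2 * (x - 1).
#
#     x_opt = _special_minimize(
#         fun=lambda x: np.sum((x - 1.0) ** 2),
#         grad=lambda x: 2.0 * (x - 1.0),
#         x=np.zeros(3),
#         tol_NM=1e-6,
#         tol=1e-14,
#     )
#     assert_allclose(x_opt, np.ones(3), rtol=1e-7)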
@pytest.fixture
def global_random_seed():
    """Fixed seed standing in for scikit-learn's global_random_seed fixture."""
    return 42
@pytest.fixture(
params=itertools.product(
["long", "wide"],
[
GeneralizedLinearRegressor(family="binomial"),
GeneralizedLinearRegressor(family="poisson"),
GeneralizedLinearRegressor(family="gamma"),
GeneralizedLinearRegressor(family="tweedie"),
],
),
ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
"""Dataset with GLM solutions, well conditioned X.
This is inspired by ols_ridge_dataset in test_ridge.py.
The construction is based on the SVD decomposition of X = U S V'.
Parameters
----------
type : {"long", "wide"}
If "long", then n_samples > n_features.
If "wide", then n_features > n_samples.
model : a GLM model
For "wide", we return the minimum norm solution:
min ||w||_2 subject to w = argmin deviance(X, y, w)
    Note that the deviance is minimized whenever y = inverse_link(X w) can be
    achieved exactly, which is possible in the wide data case. Therefore, we can
    construct the minimum norm solution as for (wide) OLS:
min ||w||_2 subject to link(y) = raw_prediction = X w
Returns
-------
model : GLM model
X : ndarray
        Last column is all ones and acts as the intercept.
y : ndarray
coef_unpenalized : ndarray
        Minimum norm solution of min sum(loss(w)), i.e. the solution with
        minimum ||w||_2 in case of ambiguity.
        Last coefficient is the intercept.
coef_penalized : ndarray
GLM solution with alpha=l2_reg_strength=1, i.e.
min 1/n * sum(loss) + ||w[:-1]||_2^2.
        Last coefficient is the intercept.
l2_reg_strength : float
        Always equal to 1.
"""
data_type, model = request.param
    # Make the larger dimension more than twice as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
if data_type == "long":
n_samples, n_features = 12, 4
else:
n_samples, n_features = 4, 12
k = min(n_samples, n_features)
rng = np.random.RandomState(global_random_seed)
X = make_low_rank_matrix(
n_samples=n_samples,
n_features=n_features,
effective_rank=k,
tail_strength=0.1,
random_state=rng,
)
    X[:, -1] = 1  # last column acts as intercept
U, s, Vt = linalg.svd(X, full_matrices=False)
assert np.all(s > 1e-3) # to be sure
assert np.max(s) / np.min(s) < 100 # condition number of X
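    # Note: max(s) / min(s) is the 2-norm condition number, i.e. np.linalg.cond(X).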
if data_type == "long":
coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
coef_unpenalized *= rng.choice([-1, 1], size=n_features)
raw_prediction = X @ coef_unpenalized
else:
raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)
# minimum norm solution min ||w||_2 such that raw_prediction = X w:
# w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction
coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction
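        # Equivalently: coef_unpenalized = np.linalg.pinv(X) @ raw_prediction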
if model.family == "binomial":
loss = HalfBinomialLoss()
elif model.family == "poisson":
loss = HalfPoissonLoss()
elif model.family == "gamma":
loss = HalfGammaLoss()
elif model.family == "tweedie":
loss = HalfTweedieLoss(power=1.5)
linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=True)
sw = np.full(shape=n_samples, fill_value=1 / n_samples)
y = linear_loss.base_loss.link.inverse(raw_prediction)
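    # E.g. for the log link (poisson, gamma, tweedie) this is
    # np.exp(raw_prediction); for the binomial logit link it is
    # scipy.special.expit(raw_prediction).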
# Add penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve with
# optimizer. Note that the problem is well conditioned such that we get accurate
# results.
l2_reg_strength = 1
fun = partial(
linear_loss.loss,
X=X[:, :-1],
y=y,
sample_weight=sw,
l2_reg_strength=l2_reg_strength,
)
grad = partial(
linear_loss.gradient,
X=X[:, :-1],
y=y,
sample_weight=sw,
l2_reg_strength=l2_reg_strength,
)
coef_penalized_with_intercept = _special_minimize(
fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14
)
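    # Note: with fit_intercept=True, LinearModelLoss expects X without the
    # intercept column and takes coef[-1] as the intercept, which is why
    # X=X[:, :-1] is passed above.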
linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=False)
fun = partial(
linear_loss.loss,
X=X[:, :-1],
y=y,
sample_weight=sw,
l2_reg_strength=l2_reg_strength,
)
grad = partial(
linear_loss.gradient,
X=X[:, :-1],
y=y,
sample_weight=sw,
l2_reg_strength=l2_reg_strength,
)
coef_penalized_without_intercept = _special_minimize(
fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14
)
# To be sure
assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm(
coef_unpenalized
)
return (
model,
X,
y,
coef_unpenalized,
coef_penalized_with_intercept,
coef_penalized_without_intercept,
l2_reg_strength,
)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_glm_regression(solver, fit_intercept, glm_dataset):
"""Test that GLM converges for all solvers to correct solution.
    We work with a simple constructed data set with a known solution.
"""
model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
params = dict(
alpha=alpha,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
X = X[:, :-1] # remove intercept
if fit_intercept:
coef = coef_with_intercept
intercept = coef[-1]
coef = coef[:-1]
else:
coef = coef_without_intercept
intercept = 0
model.fit(X, y)
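    # lbfgs typically reaches lower precision than the IRLS solvers, hence the
    # looser tolerance for it below.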
rtol = 5e-5 if solver == "lbfgs" else 1e-9
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, coef, rtol=rtol)
# Same with sample_weight.
model = (
clone(model).set_params(**params).fit(X, y, sample_weight=np.ones(X.shape[0]))
)
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, coef, rtol=rtol)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset):
"""Test that GLM converges for all solvers to correct solution on hstacked data.
    We work with a simple constructed data set with a known solution.
Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2.
For long X, [X, X] is still a long but singular matrix.
"""
model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
n_samples, n_features = X.shape
params = dict(
alpha=alpha / 2,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
X = X[:, :-1] # remove intercept
X = 0.5 * np.concatenate((X, X), axis=1)
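    # With X -> [X, X] / 2 the solution becomes w = [coef, coef], and
    # ||[coef, coef]||_2^2 = 2 * ||coef||_2^2, so alpha / 2 leaves the penalty
    # term unchanged: alpha / 2 * 2 * ||coef||_2^2 = alpha * ||coef||_2^2.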
assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1)
if fit_intercept:
coef = coef_with_intercept
intercept = coef[-1]
coef = coef[:-1]
else:
coef = coef_without_intercept
intercept = 0
with warnings.catch_warnings():
# XXX: Investigate if the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime we don't
# fail when the assertions below pass irrespective of the presence of
# the warning.
warnings.simplefilter("ignore", ConvergenceWarning)
model.fit(X, y)
rtol = 2e-4 if solver == "lbfgs" else 5e-9
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset):
"""Test that GLM converges for all solvers to correct solution on vstacked data.
    We work with a simple constructed data set with a known solution.
    Fit on X with alpha is the same as a fit on the vertically stacked data

        [X], [y]
        [X], [y]

    with the same alpha, because the average loss stays the same.
For wide X, [X', X'] is a singular matrix.
"""
model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
n_samples, n_features = X.shape
params = dict(
alpha=alpha,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
X = X[:, :-1] # remove intercept
X = np.concatenate((X, X), axis=0)
assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
y = np.r_[y, y]
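    # Duplicating the rows doubles both sum(loss) and n_samples, so the average
    # loss 1/n * sum(loss) and hence the optimal alpha are unchanged.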
if fit_intercept:
coef = coef_with_intercept
intercept = coef[-1]
coef = coef[:-1]
else:
coef = coef_without_intercept
intercept = 0
model.fit(X, y)
rtol = 3e-5 if solver == "lbfgs" else 5e-9
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, coef, rtol=rtol)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset):
"""Test that unpenalized GLM converges for all solvers to correct solution.
    We work with a simple constructed data set with a known solution.
Note: This checks the minimum norm solution for wide X, i.e.
n_samples < n_features:
min ||w||_2 subject to w = argmin deviance(X, y, w)
"""
model, X, y, coef, _, _, _ = glm_dataset
n_samples, n_features = X.shape
alpha = 0 # unpenalized
params = dict(
alpha=alpha,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
if fit_intercept:
X = X[:, :-1] # remove intercept
intercept = coef[-1]
coef = coef[:-1]
else:
intercept = 0
with warnings.catch_warnings():
if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to
            # LBFGS in this case. The model should still converge.
warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
# XXX: Investigate if the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime we don't
# fail when the assertions below pass irrespective of the presence of
# the warning.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
model.fit(X, y)
# FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails
# for the wide/fat case with n_features > n_samples. Most current GLM solvers do
# NOT return the minimum norm solution with fit_intercept=True.
if n_samples > n_features:
rtol = 5e-5 if solver == "lbfgs" else 1e-7
assert model.intercept_ == pytest.approx(intercept)
assert_allclose(model.coef_, coef, rtol=rtol)
else:
# As it is an underdetermined problem, prediction = y. The following shows that
# we get a solution, i.e. a (non-unique) minimum of the objective function ...
rtol = 5e-5
if solver == "newton-cholesky":
rtol = 5e-4
assert_allclose(model.predict(X), y, rtol=rtol)
norm_solution = np.linalg.norm(np.r_[intercept, coef])
norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
if solver == "newton-cholesky":
# XXX: This solver shows random behaviour. Sometimes it finds solutions
# with norm_model <= norm_solution! So we check conditionally.
if norm_model < (1 + 1e-12) * norm_solution:
assert model.intercept_ == pytest.approx(intercept)
assert_allclose(model.coef_, coef, rtol=rtol)
elif solver == "lbfgs" and fit_intercept:
# But it is not the minimum norm solution. Otherwise the norms would be
# equal.
assert norm_model > (1 + 1e-12) * norm_solution
# See https://github.com/scikit-learn/scikit-learn/issues/23670.
# Note: Even adding a tiny penalty does not give the minimal norm solution.
# XXX: We could have naively expected LBFGS to find the minimal norm
# solution by adding a very small penalty. Even that fails for a reason we
# do not properly understand at this point.
else:
# When `fit_intercept=False`, LBFGS naturally converges to the minimum norm
# solution on this problem.
# XXX: Do we have any theoretical guarantees why this should be the case?
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, coef, rtol=rtol)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset):
"""Test that unpenalized GLM converges for all solvers to correct solution.
    We work with a simple constructed data set with a known solution.
GLM fit on [X] is the same as fit on [X, X]/2.
For long X, [X, X] is a singular matrix and we check against the minimum norm
solution:
min ||w||_2 subject to w = argmin deviance(X, y, w)
"""
model, X, y, coef, _, _, _ = glm_dataset
n_samples, n_features = X.shape
alpha = 0 # unpenalized
params = dict(
alpha=alpha,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
if fit_intercept:
intercept = coef[-1]
coef = coef[:-1]
if n_samples > n_features:
X = X[:, :-1] # remove intercept
X = 0.5 * np.concatenate((X, X), axis=1)
else:
# To know the minimum norm solution, we keep one intercept column and do
# not divide by 2. Later on, we must take special care.
X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]]
else:
intercept = 0
X = 0.5 * np.concatenate((X, X), axis=1)
assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
with warnings.catch_warnings():
if solver.startswith("newton"):
            # The newton solvers should warn and automatically fall back to
            # LBFGS in this case. The model should still converge.
warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
# XXX: Investigate if the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime we don't
# fail when the assertions below pass irrespective of the presence of
# the warning.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
model.fit(X, y)
if fit_intercept and n_samples < n_features:
# Here we take special care.
model_intercept = 2 * model.intercept_
model_coef = 2 * model.coef_[:-1] # exclude the other intercept term.
# For minimum norm solution, we would have
# assert model.intercept_ == pytest.approx(model.coef_[-1])
else:
model_intercept = model.intercept_
model_coef = model.coef_
if n_samples > n_features:
assert model_intercept == pytest.approx(intercept)
rtol = 1e-4
assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol)
else:
# As it is an underdetermined problem, prediction = y. The following shows that
# we get a solution, i.e. a (non-unique) minimum of the objective function ...
rtol = 1e-6 if solver == "lbfgs" else 5e-6
assert_allclose(model.predict(X), y, rtol=rtol)
if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky":
# Same as in test_glm_regression_unpenalized.
# But it is not the minimum norm solution. Otherwise the norms would be
# equal.
norm_solution = np.linalg.norm(
0.5 * np.r_[intercept, intercept, coef, coef]
)
norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
assert norm_model > (1 + 1e-12) * norm_solution
# For minimum norm solution, we would have
# assert model.intercept_ == pytest.approx(model.coef_[-1])
else:
assert model_intercept == pytest.approx(intercept, rel=5e-6)
assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4)
@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset):
"""Test that unpenalized GLM converges for all solvers to correct solution.
    We work with a simple constructed data set with a known solution.
    GLM fit on X is the same as a fit on the vertically stacked data

        [X], [y]
        [X], [y]

For wide X, [X', X'] is a singular matrix and we check against the minimum norm
solution:
min ||w||_2 subject to w = argmin deviance(X, y, w)
"""
model, X, y, coef, _, _, _ = glm_dataset
n_samples, n_features = X.shape
alpha = 0 # unpenalized
params = dict(
alpha=alpha,
fit_intercept=fit_intercept,
solver=solver,
gradient_tol=1e-12,
max_iter=1000,
)
model = clone(model).set_params(**params)
if fit_intercept:
X = X[:, :-1] # remove intercept
intercept = coef[-1]
coef = coef[:-1]
else:
intercept = 0
X = np.concatenate((X, X), axis=0)
assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
y = np.r_[y, y]
with warnings.catch_warnings():
if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to
            # LBFGS in this case. The model should still converge.
warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
# XXX: Investigate if the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime we don't
# fail when the assertions below pass irrespective of the presence of
# the warning.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
model.fit(X, y)
if n_samples > n_features:
rtol = 5e-5 if solver == "lbfgs" else 1e-6
assert model.intercept_ == pytest.approx(intercept)
assert_allclose(model.coef_, coef, rtol=rtol)
else:
# As it is an underdetermined problem, prediction = y. The following shows that
# we get a solution, i.e. a (non-unique) minimum of the objective function ...
rtol = 1e-6 if solver == "lbfgs" else 5e-6
assert_allclose(model.predict(X), y, rtol=rtol)
norm_solution = np.linalg.norm(np.r_[intercept, coef])
norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
if solver == "newton-cholesky":
# XXX: This solver shows random behaviour. Sometimes it finds solutions
# with norm_model <= norm_solution! So we check conditionally.
if not (norm_model > (1 + 1e-12) * norm_solution):
assert model.intercept_ == pytest.approx(intercept)
assert_allclose(model.coef_, coef, rtol=1e-4)
elif solver == "lbfgs" and fit_intercept:
# Same as in test_glm_regression_unpenalized.
# But it is not the minimum norm solution. Otherwise the norms would be
# equal.
assert norm_model > (1 + 1e-12) * norm_solution
else:
rtol = 1e-5 if solver == "newton-cholesky" else 1e-4
assert model.intercept_ == pytest.approx(intercept, rel=rtol)
assert_allclose(model.coef_, coef, rtol=rtol)