Scikit-learn GLM tests for Glum
# Authors: Christian Lorentzen <lorentzen.ch@gmail.com>
#
# License: BSD 3 clause
import itertools
import warnings
from functools import partial

import numpy as np
import pytest
import scipy
from numpy.testing import assert_allclose
from scipy import linalg
from scipy.optimize import minimize, root
from sklearn._loss import (
    HalfBinomialLoss,
    HalfGammaLoss,
    HalfPoissonLoss,
    HalfTweedieLoss,
)
from sklearn._loss.link import IdentityLink, LogLink
from sklearn.base import clone
from sklearn.datasets import make_low_rank_matrix, make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import (
    GammaRegressor,
    PoissonRegressor,
    Ridge,
    TweedieRegressor,
)
from sklearn.linear_model._glm import _GeneralizedLinearRegressor
from sklearn.linear_model._glm._newton_solver import NewtonCholeskySolver
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.metrics import d2_tweedie_score, mean_poisson_deviance
from sklearn.model_selection import train_test_split

from glum import GeneralizedLinearRegressor

SOLVERS = ["lbfgs", "irls-ls", "irls-cd"]


def _special_minimize(fun, grad, x, tol_NM, tol):
    # Find a good starting point with Nelder-Mead.
    res_NM = minimize(
        fun, x, method="Nelder-Mead", options={"xatol": tol_NM, "fatol": tol_NM}
    )
    # Now refine via root finding on the gradient of the function, which is
    # more precise than minimizing the function itself.
    res = root(
        grad,
        res_NM.x,
        method="lm",
        options={"ftol": tol, "xtol": tol, "gtol": tol},
    )
    return res.x
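

# The two-stage scheme above (a derivative-free Nelder-Mead pass followed by
# root finding on the gradient) is easiest to see on a toy problem. The helper
# below is a hypothetical, self-contained sketch, not part of the test suite,
# that applies the same idea to a simple quadratic whose exact minimizer is
# known from least squares.
def _two_stage_minimize_demo():
    """Hypothetical sketch: minimize f(w) = 0.5 * ||A w - b||^2 in two stages."""
    rng = np.random.RandomState(0)
    A = rng.normal(size=(20, 3))
    b = rng.normal(size=20)

    def fun(w):
        residual = A @ w - b
        return 0.5 * residual @ residual

    def grad(w):
        return A.T @ (A @ w - b)

    # Stage 1 (inside _special_minimize): coarse, derivative-free localization.
    # Stage 2: polish by solving grad(w) = 0, which is far more precise.
    w = _special_minimize(fun, grad, np.zeros(3), tol_NM=1e-6, tol=1e-14)
    # For this quadratic, the least-squares solution is the exact minimizer.
    w_exact = np.linalg.lstsq(A, b, rcond=None)[0]
    assert_allclose(w, w_exact, rtol=1e-8, atol=1e-10)
    return w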


@pytest.fixture
def global_random_seed():
    return 42


@pytest.fixture(
    params=itertools.product(
        ["long", "wide"],
        [
            GeneralizedLinearRegressor(family="binomial"),
            GeneralizedLinearRegressor(family="poisson"),
            GeneralizedLinearRegressor(family="gamma"),
            GeneralizedLinearRegressor(family="tweedie"),
        ],
    ),
    ids=lambda param: f"{param[0]}-{param[1]}",
)
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
    ----------
    type : {"long", "wide"}
        If "long", then n_samples > n_features.
        If "wide", then n_features > n_samples.
    model : a GLM model

    For "wide", we return the minimum norm solution:

        min ||w||_2 subject to w = argmin deviance(X, y, w)

    Note that the deviance is always minimized if y = inverse_link(X w) is possible
    to achieve, which it is in the wide data case. Therefore, we can construct the
    solution with minimum norm like (wide) OLS:

        min ||w||_2 subject to link(y) = raw_prediction = X w

    Returns
    -------
    model : GLM model
    X : ndarray
        Last column of 1, i.e. intercept.
    y : ndarray
    coef_unpenalized : ndarray
        Minimum norm solution, i.e. min sum(loss(w)) (with minimum ||w||_2 in
        case of ambiguity).
        Last coefficient is intercept.
    coef_penalized : ndarray
        GLM solution with alpha=l2_reg_strength=1, i.e.
        min 1/n * sum(loss) + ||w[:-1]||_2^2.
        Last coefficient is intercept.
    l2_reg_strength : float
        Always equal to 1.
    """
    data_type, model = request.param
    # Make the larger dimension more than twice as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
    if data_type == "long":
        n_samples, n_features = 12, 4
    else:
        n_samples, n_features = 4, 12
    k = min(n_samples, n_features)
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=k,
        tail_strength=0.1,
        random_state=rng,
    )
    X[:, -1] = 1  # last column acts as intercept
    U, s, Vt = linalg.svd(X, full_matrices=False)
    assert np.all(s > 1e-3)  # to be sure
    assert np.max(s) / np.min(s) < 100  # condition number of X

    if data_type == "long":
        coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
        coef_unpenalized *= rng.choice([-1, 1], size=n_features)
        raw_prediction = X @ coef_unpenalized
    else:
        raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)
        # minimum norm solution min ||w||_2 such that raw_prediction = X w:
        # w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction
        # (see also the standalone sketch after this fixture)
        coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction

    if model.family == "binomial":
        loss = HalfBinomialLoss()
    elif model.family == "poisson":
        loss = HalfPoissonLoss()
    elif model.family == "gamma":
        loss = HalfGammaLoss()
    elif model.family == "tweedie":
        loss = HalfTweedieLoss(power=1.5)

    linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=True)
    sw = np.full(shape=n_samples, fill_value=1 / n_samples)
    y = linear_loss.base_loss.link.inverse(raw_prediction)

    # Add the penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve
    # with an optimizer. Note that the problem is well conditioned such that we get
    # accurate results.
    l2_reg_strength = 1
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_with_intercept = _special_minimize(
        fun, grad, coef_unpenalized, tol_NM=1e-6, tol=1e-14
    )

    linear_loss = LinearModelLoss(base_loss=loss, fit_intercept=False)
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_without_intercept = _special_minimize(
        fun, grad, coef_unpenalized[:-1], tol_NM=1e-6, tol=1e-14
    )

    # To be sure
    assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm(
        coef_unpenalized
    )

    return (
        model,
        X,
        y,
        coef_unpenalized,
        coef_penalized_with_intercept,
        coef_penalized_without_intercept,
        l2_reg_strength,
    )
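

# For the wide case above, the unpenalized coefficients are built from the SVD
# as w = V s^(-1) U' raw_prediction, which is the minimum norm solution of
# raw_prediction = X w. The following hypothetical sketch (not used by the
# tests) checks, under a similar random construction, that this formula agrees
# with the Moore-Penrose pseudoinverse.
def _minimum_norm_demo():
    """Hypothetical sketch: SVD formula vs. np.linalg.pinv for min ||w||_2."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 4, 12  # wide, underdetermined system
    X = rng.normal(size=(n_samples, n_features))
    raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)

    U, s, Vt = linalg.svd(X, full_matrices=False)
    coef_svd = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction
    coef_pinv = np.linalg.pinv(X) @ raw_prediction

    # The SVD formula and the pseudoinverse give the same coefficients ...
    assert_allclose(coef_svd, coef_pinv)
    # ... and they reproduce raw_prediction exactly (up to float precision);
    # among all solutions of X w = raw_prediction, this one has minimal ||w||_2.
    assert_allclose(X @ coef_svd, raw_prediction, atol=1e-12)
    return coef_svd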


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_glm_regression(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    model.fit(X, y)

    rtol = 5e-5 if solver == "lbfgs" else 1e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)

    # Same with sample_weight.
    model = (
        clone(model).set_params(**params).fit(X, y, sample_weight=np.ones(X.shape[0]))
    )
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_hstacked_X(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution on hstacked data.

    We work with a simple constructed data set with known solution.
    Fit on [X] with alpha is the same as fit on [X, X]/2 with alpha/2.
    For long X, [X, X] is still a long but singular matrix.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    n_samples, n_features = X.shape
    params = dict(
        alpha=alpha / 2,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    X = 0.5 * np.concatenate((X, X), axis=1)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features - 1)
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    with warnings.catch_warnings():
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.simplefilter("ignore", ConvergenceWarning)
        model.fit(X, y)

    rtol = 2e-4 if solver == "lbfgs" else 5e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, np.r_[coef, coef], rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_vstacked_X(solver, fit_intercept, glm_dataset):
    """Test that GLM converges for all solvers to the correct solution on vstacked data.

    We work with a simple constructed data set with known solution.
    Fit on [X] with alpha is the same as fit on the vertically stacked data
    [[X], [X]] and [[y], [y]] with the same alpha, because the average loss
    stays the same. (See the illustrative sketch after this test.)
    For wide X, [X', X'] is a singular matrix.
    """
    model, X, y, _, coef_with_intercept, coef_without_intercept, alpha = glm_dataset
    n_samples, n_features = X.shape
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    X = X[:, :-1]  # remove intercept column
    X = np.concatenate((X, X), axis=0)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
    y = np.r_[y, y]
    if fit_intercept:
        coef = coef_with_intercept
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        coef = coef_without_intercept
        intercept = 0
    model.fit(X, y)

    rtol = 3e-5 if solver == "lbfgs" else 5e-9
    assert model.intercept_ == pytest.approx(intercept, rel=rtol)
    assert_allclose(model.coef_, coef, rtol=rtol)
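

# The vstacked test relies on the fact that duplicating every row leaves the
# *average* loss unchanged, so the penalty strength alpha does not need to be
# rescaled. Below is a hypothetical sketch of that argument (not used by the
# tests), using the mean Poisson deviance already imported above.
def _vstack_mean_loss_demo():
    """Hypothetical sketch: the mean loss is invariant under duplicating rows."""
    rng = np.random.RandomState(0)
    y_true = rng.uniform(low=0.5, high=3.0, size=10)
    y_pred = rng.uniform(low=0.5, high=3.0, size=10)
    # Stacking [y, y] doubles the summed deviance but not the mean deviance.
    single = mean_poisson_deviance(y_true, y_pred)
    stacked = mean_poisson_deviance(np.r_[y_true, y_true], np.r_[y_pred, y_pred])
    assert_allclose(single, stacked)
    return single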


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    Note: This checks the minimum norm solution for wide X, i.e.
    n_samples < n_features:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        X = X[:, :-1]  # remove intercept column
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        intercept = 0

    with warnings.catch_warnings():
        if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    # FIXME: `assert_allclose(model.coef_, coef)` should work for all cases but fails
    # for the wide/fat case with n_features > n_samples. Most current GLM solvers do
    # NOT return the minimum norm solution with fit_intercept=True.
    if n_samples > n_features:
        rtol = 5e-5 if solver == "lbfgs" else 1e-7
        assert model.intercept_ == pytest.approx(intercept)
        assert_allclose(model.coef_, coef, rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 5e-5
        if solver == "newton-cholesky":
            rtol = 5e-4
        assert_allclose(model.predict(X), y, rtol=rtol)

        norm_solution = np.linalg.norm(np.r_[intercept, coef])
        norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
        if solver == "newton-cholesky":
            # XXX: This solver shows random behaviour. Sometimes it finds solutions
            # with norm_model <= norm_solution! So we check conditionally.
            if norm_model < (1 + 1e-12) * norm_solution:
                assert model.intercept_ == pytest.approx(intercept)
                assert_allclose(model.coef_, coef, rtol=rtol)
        elif solver == "lbfgs" and fit_intercept:
            # ... but it is not the minimum norm solution. Otherwise the norms
            # would be equal.
            assert norm_model > (1 + 1e-12) * norm_solution
            # See https://github.com/scikit-learn/scikit-learn/issues/23670.
            # Note: Even adding a tiny penalty does not give the minimal norm
            # solution.
            # XXX: We could have naively expected LBFGS to find the minimal norm
            # solution by adding a very small penalty. Even that fails for a reason
            # we do not properly understand at this point.
        else:
            # When `fit_intercept=False`, LBFGS naturally converges to the minimum
            # norm solution on this problem.
            # XXX: Do we have any theoretical guarantees why this should be the
            # case?
            assert model.intercept_ == pytest.approx(intercept, rel=rtol)
            assert_allclose(model.coef_, coef, rtol=rtol)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_hstacked_X(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    GLM fit on [X] is the same as fit on [X, X]/2.
    For long X, [X, X] is a singular matrix and we check against the minimum norm
    solution:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        intercept = coef[-1]
        coef = coef[:-1]
        if n_samples > n_features:
            X = X[:, :-1]  # remove intercept column
            X = 0.5 * np.concatenate((X, X), axis=1)
        else:
            # To know the minimum norm solution, we keep one intercept column and do
            # not divide by 2. Later on, we must take special care.
            X = np.c_[X[:, :-1], X[:, :-1], X[:, -1]]
    else:
        intercept = 0
        X = 0.5 * np.concatenate((X, X), axis=1)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)

    with warnings.catch_warnings():
        if solver.startswith("newton"):
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    if fit_intercept and n_samples < n_features:
        # Here we take special care.
        model_intercept = 2 * model.intercept_
        model_coef = 2 * model.coef_[:-1]  # exclude the other intercept term
        # For the minimum norm solution, we would have
        # assert model.intercept_ == pytest.approx(model.coef_[-1])
    else:
        model_intercept = model.intercept_
        model_coef = model.coef_

    if n_samples > n_features:
        assert model_intercept == pytest.approx(intercept)
        rtol = 1e-4
        assert_allclose(model_coef, np.r_[coef, coef], rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 1e-6 if solver == "lbfgs" else 5e-6
        assert_allclose(model.predict(X), y, rtol=rtol)
        if (solver == "lbfgs" and fit_intercept) or solver == "newton-cholesky":
            # Same as in test_glm_regression_unpenalized.
            # But it is not the minimum norm solution. Otherwise the norms would be
            # equal.
            norm_solution = np.linalg.norm(
                0.5 * np.r_[intercept, intercept, coef, coef]
            )
            norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
            assert norm_model > (1 + 1e-12) * norm_solution
            # For the minimum norm solution, we would have
            # assert model.intercept_ == pytest.approx(model.coef_[-1])
        else:
            assert model_intercept == pytest.approx(intercept, rel=5e-6)
            assert_allclose(model_coef, np.r_[coef, coef], rtol=1e-4)


@pytest.mark.parametrize("solver", SOLVERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_glm_regression_unpenalized_vstacked_X(solver, fit_intercept, glm_dataset):
    """Test that unpenalized GLM converges for all solvers to the correct solution.

    We work with a simple constructed data set with known solution.
    GLM fit on [X] is the same as fit on the vertically stacked data
    [[X], [X]] and [[y], [y]].
    For wide X, [X', X'] is a singular matrix and we check against the minimum norm
    solution:
        min ||w||_2 subject to w = argmin deviance(X, y, w)
    """
    model, X, y, coef, _, _, _ = glm_dataset
    n_samples, n_features = X.shape
    alpha = 0  # unpenalized
    params = dict(
        alpha=alpha,
        fit_intercept=fit_intercept,
        solver=solver,
        gradient_tol=1e-12,
        max_iter=1000,
    )
    model = clone(model).set_params(**params)
    if fit_intercept:
        X = X[:, :-1]  # remove intercept column
        intercept = coef[-1]
        coef = coef[:-1]
    else:
        intercept = 0
    X = np.concatenate((X, X), axis=0)
    assert np.linalg.matrix_rank(X) <= min(n_samples, n_features)
    y = np.r_[y, y]

    with warnings.catch_warnings():
        if solver.startswith("newton") and n_samples < n_features:
            # The newton solvers should warn and automatically fall back to LBFGS
            # in this case. The model should still converge.
            warnings.filterwarnings("ignore", category=scipy.linalg.LinAlgWarning)
        # XXX: Investigate whether the ConvergenceWarning that can appear in some
        # cases should be considered a bug or not. In the meantime, we don't fail
        # when the assertions below pass irrespective of the presence of the
        # warning.
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        model.fit(X, y)

    if n_samples > n_features:
        rtol = 5e-5 if solver == "lbfgs" else 1e-6
        assert model.intercept_ == pytest.approx(intercept)
        assert_allclose(model.coef_, coef, rtol=rtol)
    else:
        # As it is an underdetermined problem, prediction = y. The following shows
        # that we get a solution, i.e. a (non-unique) minimum of the objective
        # function ...
        rtol = 1e-6 if solver == "lbfgs" else 5e-6
        assert_allclose(model.predict(X), y, rtol=rtol)

        norm_solution = np.linalg.norm(np.r_[intercept, coef])
        norm_model = np.linalg.norm(np.r_[model.intercept_, model.coef_])
        if solver == "newton-cholesky":
            # XXX: This solver shows random behaviour. Sometimes it finds solutions
            # with norm_model <= norm_solution! So we check conditionally.
            if not (norm_model > (1 + 1e-12) * norm_solution):
                assert model.intercept_ == pytest.approx(intercept)
                assert_allclose(model.coef_, coef, rtol=1e-4)
        elif solver == "lbfgs" and fit_intercept:
            # Same as in test_glm_regression_unpenalized.
            # But it is not the minimum norm solution. Otherwise the norms would be
            # equal.
            assert norm_model > (1 + 1e-12) * norm_solution
        else:
            rtol = 1e-5 if solver == "newton-cholesky" else 1e-4
            assert model.intercept_ == pytest.approx(intercept, rel=rtol)
            assert_allclose(model.coef_, coef, rtol=rtol)