# %%
import numpy as np
from numpy.testing import assert_allclose
from sklearn.preprocessing import scale
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from scipy.optimize import fmin_l_bfgs_b, check_grad

rng = np.random.RandomState(42)
n_samples = 200
X, y = make_regression(
    n_samples=n_samples,
    effective_rank=10,
    n_features=300,
    n_informative=50,
    random_state=0,
)
X = scale(X)  # to be able to use normalize=False safely
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Check that setting sw to 0 for the first samples is equivalent to
# removing them:
# sw = np.full(shape=X_train.shape[0], fill_value=.42)
sw = rng.uniform(low=0.1, high=2, size=X_train.shape[0])
cutoff = X_train.shape[0] // 3
sw[:cutoff] = 0.
X_train_trimmed = X_train[cutoff:]
y_train_trimmed = y_train[cutoff:]
sw_trimmed = sw[cutoff:]

alpha = 1.
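
# The objective minimized below (the intercept is left unpenalized):
#
#   loss(coef, intercept) = 0.5 * sum_i sw_i * (y_i - x_i @ coef - intercept) ** 2
#                           + 0.5 * alpha * ||coef|| ** 2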
def ridge_loss(coef, intercept, alpha, X, y, sample_weight=None):
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)
    return (
        0.5 * np.sum(sample_weight * (y - X @ coef - intercept) ** 2) +
        0.5 * alpha * np.sum(coef ** 2)
    )


def ridge_bfgs(X, y, alpha, sample_weight=None):
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)

    def f(w):
        coef, intercept = w[:-1], w[-1]
        return ridge_loss(coef, intercept, alpha, X, y,
                          sample_weight=sample_weight)

    def fprime(w):
        coef, intercept = w[:-1], w[-1]
        residual = y - X @ coef - intercept
        grad = np.empty(X.shape[1] + 1)
        grad[:-1] = -X.T @ (sample_weight * residual) + alpha * coef
        grad[-1] = -np.sum(sample_weight * residual)
        return grad

    dim = X.shape[1] + 1
    # print(
    #     "check_grad:",
    #     check_grad(f, fprime, np.random.RandomState(0).randn(dim)),
    # )
    w0 = np.zeros(dim)
    w0[-1] = np.average(y, weights=sample_weight)
    # Set factr to a lower value than the default to reach a higher
    # quality convergence.
    w = fmin_l_bfgs_b(f, w0, fprime, factr=10.)[0]
    coef, intercept = w[:-1], w[-1]
    return coef, intercept
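
# %%
# Standalone version of the commented-out gradient check above (added
# sketch; the helper names loss_at / grad_at are ad hoc): compare the
# analytical gradient of ridge_loss against finite differences at a
# random point. check_grad returns the 2-norm of the difference, which
# should be small relative to the gradient magnitude.
def loss_at(w):
    return ridge_loss(w[:-1], w[-1], alpha, X_train, y_train,
                      sample_weight=sw)


def grad_at(w):
    coef, intercept = w[:-1], w[-1]
    residual = y_train - X_train @ coef - intercept
    grad = np.empty(X_train.shape[1] + 1)
    grad[:-1] = -X_train.T @ (sw * residual) + alpha * coef
    grad[-1] = -np.sum(sw * residual)
    return grad


w_check = np.random.RandomState(0).randn(X_train.shape[1] + 1)
print("check_grad:", check_grad(loss_at, grad_at, w_check))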
# %%
# Sanity check w.r.t. the sample weight invariance properties of the
# ridge_bfgs implementation.
coef, intercept = ridge_bfgs(X_train, y_train, alpha, sw)
coef_trimmed, intercept_trimmed = ridge_bfgs(
    X_train_trimmed, y_train_trimmed, alpha, sw_trimmed)
assert_allclose(coef, coef_trimmed)
assert_allclose(intercept, intercept_trimmed)

coef_2sw, intercept_2sw = ridge_bfgs(X_train, y_train, alpha, 2 * sw)
coef_dup, intercept_dup = ridge_bfgs(
    np.concatenate([X_train, X_train]),
    np.concatenate([y_train, y_train]),
    alpha,
    np.concatenate([sw, sw]),
)
assert_allclose(coef_2sw, coef_dup)
assert_allclose(intercept_2sw, intercept_dup)
print("ridge_bfgs sample_weight invariance: ok")
# %%
# Check that we have converged to a minimum of the ridge_loss function.
def check_min(coef, intercept, alpha, X, y, sample_weight=None,
              noise_scale=1e-7, seed=0, n_checks=100):
    # Check that applying a small random perturbation to the fitted
    # parameters causes the loss to increase.
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)
    rng = np.random.RandomState(seed)
    model_loss = ridge_loss(
        coef,
        intercept,
        alpha,
        X,
        y,
        sample_weight=sample_weight,
    )
    noise_scale_range = np.logspace(
        np.log10(noise_scale), np.log10(noise_scale) + 4, 4
    )[::-1]
    for ns in noise_scale_range:
        correct_checks = []
        for _ in range(n_checks):
            perturbation = ns * rng.normal(size=len(coef))
            perturbed_loss = ridge_loss(
                coef + perturbation,
                intercept + ns * rng.normal(size=1),
                alpha,
                X,
                y,
                sample_weight=sample_weight,
            )
            correct_checks.append(
                perturbed_loss > model_loss - np.finfo(X.dtype).eps
            )
        if np.mean(correct_checks) < 0.99:
            raise AssertionError(
                f"Only {np.sum(correct_checks)}/{n_checks} passed at "
                f"noise scale {ns:.3e}."
            )


check_min(coef, intercept, alpha, X_train, y_train, sw, noise_scale=1e-7)
check_min(coef_2sw, intercept_2sw, alpha, X_train, y_train, 2 * sw,
          noise_scale=1e-7)
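
# %%
# Complementary first-order check (added sketch; the helper name
# ridge_grad_norm is ad hoc): at a minimum of the ridge loss the gradient
# must vanish, so its norm at the fitted parameters should be small
# relative to the scale of the problem.
def ridge_grad_norm(coef, intercept, alpha, X, y, sample_weight):
    residual = y - X @ coef - intercept
    g_coef = -X.T @ (sample_weight * residual) + alpha * coef
    g_intercept = -np.sum(sample_weight * residual)
    return np.sqrt(np.sum(g_coef ** 2) + g_intercept ** 2)


print("gradient norm at the BFGS solution:",
      ridge_grad_norm(coef, intercept, alpha, X_train, y_train, sw))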
# %%
# Check consistency with Ridge(normalize=False).
from sklearn.linear_model import Ridge

solver = "cholesky"
solver_tol = 1e-12  # only used for some solvers
ridge_sklearn = Ridge(
    alpha=alpha,
    normalize=False,
    solver=solver,
    tol=solver_tol,
).fit(X_train, y_train, sample_weight=sw)
check_min(ridge_sklearn.coef_, ridge_sklearn.intercept_, alpha,
          X_train, y_train, sw, noise_scale=1e-9)

atol = 1e-4 * np.median(np.abs(coef))
assert_allclose(ridge_sklearn.coef_, coef, atol=atol)
assert_allclose(ridge_sklearn.intercept_, intercept, atol=atol)
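
# %%
# Closed-form cross-check (added sketch; the helper name
# ridge_normal_equations is ad hoc): the same minimizer can be obtained
# by solving the weighted normal equations on weighted-centered data,
# since the optimal unpenalized intercept is the weighted mean of the
# residuals.
def ridge_normal_equations(X, y, alpha, sample_weight):
    x_mean = np.average(X, axis=0, weights=sample_weight)
    y_mean = np.average(y, weights=sample_weight)
    Xc, yc = X - x_mean, y - y_mean
    A = Xc.T @ (sample_weight[:, None] * Xc)
    A.flat[::A.shape[0] + 1] += alpha  # add alpha to the diagonal
    coef = np.linalg.solve(A, Xc.T @ (sample_weight * yc))
    intercept = y_mean - x_mean @ coef
    return coef, intercept


coef_ne, intercept_ne = ridge_normal_equations(X_train, y_train, alpha, sw)
assert_allclose(coef_ne, coef, atol=atol)
assert_allclose(coef_ne, ridge_sklearn.coef_, atol=atol)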
# %%
# Same consistency check with Ridge(normalize=False), this time with the
# doubled sample weights.
ridge_sklearn_2sw = Ridge(
    alpha=alpha,
    normalize=False,
    solver=solver,
    tol=solver_tol,
).fit(X_train, y_train, sample_weight=2 * sw)
check_min(ridge_sklearn_2sw.coef_, ridge_sklearn_2sw.intercept_, alpha,
          X_train, y_train, 2 * sw, noise_scale=1e-9)

atol = 1e-4 * np.median(np.abs(coef_2sw))
assert_allclose(ridge_sklearn_2sw.coef_, coef_2sw, atol=atol)
assert_allclose(ridge_sklearn_2sw.intercept_, intercept_2sw, atol=atol)
# %%
# Large-ish scale benchmarks
rng = np.random.RandomState(42)
X_large, y_large = make_regression(
    n_samples=int(5e3),
    effective_rank=100,
    n_features=int(2e3),
    n_informative=50,
    random_state=0,
)
X_large = scale(X_large)

# %%
alpha = 1e-6

# %%
%%time
coef_large, intercept_large = ridge_bfgs(X_large, y_large, alpha)

# %%
check_min(coef_large, intercept_large,
          alpha, X_large, y_large, noise_scale=1e-6)

# %%
%%time
ridge_sklearn_large = Ridge(
    alpha=alpha,
    normalize=False,
    solver="cholesky",
    tol=solver_tol,
).fit(X_large, y_large)

# %%
check_min(ridge_sklearn_large.coef_, ridge_sklearn_large.intercept_,
          alpha, X_large, y_large, noise_scale=1e-12)

# %%
atol = 1e-2 * np.median(np.abs(coef_large))
assert_allclose(coef_large, ridge_sklearn_large.coef_, atol=atol)

# %%
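# Added benchmark sketch: the iterative "lsqr" solver is another natural
# candidate at this scale; with alpha this small the problem is quite
# ill-conditioned, so only report the deviation from the "cholesky"
# solution rather than asserting agreement.

# %%
%%time
ridge_sklearn_lsqr = Ridge(
    alpha=alpha,
    normalize=False,
    solver="lsqr",
    tol=solver_tol,
).fit(X_large, y_large)

# %%
print("max |lsqr - cholesky| coef deviation:",
      np.max(np.abs(ridge_sklearn_lsqr.coef_ - ridge_sklearn_large.coef_)))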