# %%
import numpy as np
from numpy.testing import assert_allclose
from sklearn.preprocessing import scale
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from scipy.optimize import fmin_l_bfgs_b, check_grad

rng = np.random.RandomState(42)
n_samples = 200
X, y = make_regression(
    n_samples=n_samples,
    effective_rank=10,
    n_features=300,
    n_informative=50,
    random_state=0,
)
X = scale(X)  # to be able to use normalize=False safely
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Check that setting sw to 0 for the first samples is equivalent to
# removing them:
# sw = np.full(shape=X_train.shape[0], fill_value=.42)
sw = rng.uniform(low=0.1, high=2, size=X_train.shape[0])
cutoff = X_train.shape[0] // 3
sw[:cutoff] = 0.
X_train_trimmed = X_train[cutoff:]
y_train_trimmed = y_train[cutoff:]
sw_trimmed = sw[cutoff:]

alpha = 1.
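
# The objective minimized below (the intercept is left unpenalized):
#
#   loss(coef, intercept) = 0.5 * sum_i sw_i * (y_i - x_i @ coef - intercept) ** 2
#                           + 0.5 * alpha * ||coef|| ** 2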
def ridge_loss(coef, intercept, alpha, X, y, sample_weight=None):
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)
    return (
        0.5 * np.sum(sample_weight * (y - X @ coef - intercept) ** 2) +
        0.5 * alpha * np.sum(coef ** 2)
    )


def ridge_bfgs(X, y, alpha, sample_weight=None):
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)

    def f(w):
        coef, intercept = w[:-1], w[-1]
        return ridge_loss(coef, intercept, alpha, X, y,
                          sample_weight=sample_weight)

    def fprime(w):
        coef, intercept = w[:-1], w[-1]
        residual = y - X @ coef - intercept
        grad = np.empty(X.shape[1] + 1)
        grad[:-1] = -X.T @ (sample_weight * residual) + alpha * coef
        grad[-1] = -np.sum(sample_weight * residual)
        return grad

    dim = X.shape[1] + 1
    # print(
    #     "check_grad:",
    #     check_grad(f, fprime, np.random.RandomState(0).randn(dim)),
    # )
    w0 = np.zeros(dim)
    w0[-1] = np.average(y, weights=sample_weight)
    # Set factr to a lower value than the default to reach a higher
    # quality convergence.
    w = fmin_l_bfgs_b(f, w0, fprime, factr=10.)[0]
    coef, intercept = w[:-1], w[-1]
    return coef, intercept
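
# %%
# Standalone version of the commented-out gradient check above (added
# sketch; the helper names loss_at / grad_at are ad hoc): compare the
# analytical gradient of ridge_loss against finite differences at a
# random point. check_grad returns the 2-norm of the difference, which
# should be small relative to the gradient magnitude.
def loss_at(w):
    return ridge_loss(w[:-1], w[-1], alpha, X_train, y_train,
                      sample_weight=sw)


def grad_at(w):
    coef, intercept = w[:-1], w[-1]
    residual = y_train - X_train @ coef - intercept
    grad = np.empty(X_train.shape[1] + 1)
    grad[:-1] = -X_train.T @ (sw * residual) + alpha * coef
    grad[-1] = -np.sum(sw * residual)
    return grad


w_check = np.random.RandomState(0).randn(X_train.shape[1] + 1)
print("check_grad:", check_grad(loss_at, grad_at, w_check))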
# %%
# Sanity check w.r.t. the sample weight invariance properties of the
# ridge_bfgs implementation.
coef, intercept = ridge_bfgs(X_train, y_train, alpha, sw)
coef_trimmed, intercept_trimmed = ridge_bfgs(
    X_train_trimmed, y_train_trimmed, alpha, sw_trimmed)
assert_allclose(coef, coef_trimmed)
assert_allclose(intercept, intercept_trimmed)

coef_2sw, intercept_2sw = ridge_bfgs(X_train, y_train, alpha, 2 * sw)
coef_dup, intercept_dup = ridge_bfgs(
    np.concatenate([X_train, X_train]),
    np.concatenate([y_train, y_train]),
    alpha,
    np.concatenate([sw, sw]),
)
assert_allclose(coef_2sw, coef_dup)
assert_allclose(intercept_2sw, intercept_dup)
print("ridge_bfgs sample_weight invariance: ok")
# %%
# Check that we have converged to a minimum of the ridge_loss function.
def check_min(coef, intercept, alpha, X, y, sample_weight=None,
              noise_scale=1e-7, seed=0, n_checks=100):
    # Check that applying a small random perturbation to the fitted
    # parameters causes the loss to increase.
    if sample_weight is None:
        sample_weight = np.ones(shape=X.shape[0], dtype=X.dtype)
    rng = np.random.RandomState(seed)
    model_loss = ridge_loss(
        coef,
        intercept,
        alpha,
        X,
        y,
        sample_weight=sample_weight,
    )
    noise_scale_range = np.logspace(
        np.log10(noise_scale), np.log10(noise_scale) + 4, 4
    )[::-1]
    for ns in noise_scale_range:
        correct_checks = []
        for _ in range(n_checks):
            perturbation = ns * rng.normal(size=len(coef))
            perturbed_loss = ridge_loss(
                coef + perturbation,
                intercept + ns * rng.normal(size=1),
                alpha,
                X,
                y,
                sample_weight=sample_weight,
            )
            correct_checks.append(
                perturbed_loss > model_loss - np.finfo(X.dtype).eps
            )
        if np.mean(correct_checks) < 0.99:
            raise AssertionError(
                f"Only {np.sum(correct_checks)}/{n_checks} passed at "
                f"noise scale {ns:.3e}."
            )


check_min(coef, intercept, alpha, X_train, y_train, sw, noise_scale=1e-7)
check_min(coef_2sw, intercept_2sw, alpha, X_train, y_train, 2 * sw,
          noise_scale=1e-7)
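
# %%
# Complementary first-order check (added sketch; the helper name
# ridge_grad_norm is ad hoc): at a minimum of the ridge loss the gradient
# must vanish, so its norm at the fitted parameters should be small
# relative to the scale of the problem.
def ridge_grad_norm(coef, intercept, alpha, X, y, sample_weight):
    residual = y - X @ coef - intercept
    g_coef = -X.T @ (sample_weight * residual) + alpha * coef
    g_intercept = -np.sum(sample_weight * residual)
    return np.sqrt(np.sum(g_coef ** 2) + g_intercept ** 2)


print("gradient norm at the BFGS solution:",
      ridge_grad_norm(coef, intercept, alpha, X_train, y_train, sw))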
# %%
# Check consistency with Ridge(normalize=False).
from sklearn.linear_model import Ridge

solver = "cholesky"
solver_tol = 1e-12  # only used for some solvers
ridge_sklearn = Ridge(
    alpha=alpha,
    normalize=False,
    solver=solver,
    tol=solver_tol,
).fit(X_train, y_train, sample_weight=sw)
check_min(ridge_sklearn.coef_, ridge_sklearn.intercept_, alpha,
          X_train, y_train, sw, noise_scale=1e-9)

atol = 1e-4 * np.median(np.abs(coef))
assert_allclose(ridge_sklearn.coef_, coef, atol=atol)
assert_allclose(ridge_sklearn.intercept_, intercept, atol=atol)
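
# %%
# Closed-form cross-check (added sketch; the helper name
# ridge_normal_equations is ad hoc): the same minimizer can be obtained
# by solving the weighted normal equations on weighted-centered data,
# since the optimal unpenalized intercept is the weighted mean of the
# residuals.
def ridge_normal_equations(X, y, alpha, sample_weight):
    x_mean = np.average(X, axis=0, weights=sample_weight)
    y_mean = np.average(y, weights=sample_weight)
    Xc, yc = X - x_mean, y - y_mean
    A = Xc.T @ (sample_weight[:, None] * Xc)
    A.flat[::A.shape[0] + 1] += alpha  # add alpha to the diagonal
    coef = np.linalg.solve(A, Xc.T @ (sample_weight * yc))
    intercept = y_mean - x_mean @ coef
    return coef, intercept


coef_ne, intercept_ne = ridge_normal_equations(X_train, y_train, alpha, sw)
assert_allclose(coef_ne, coef, atol=atol)
assert_allclose(coef_ne, ridge_sklearn.coef_, atol=atol)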
# %%
# Same consistency check with Ridge(normalize=False), this time with the
# doubled sample weights.
ridge_sklearn_2sw = Ridge(
    alpha=alpha,
    normalize=False,
    solver=solver,
    tol=solver_tol,
).fit(X_train, y_train, sample_weight=2 * sw)
check_min(ridge_sklearn_2sw.coef_, ridge_sklearn_2sw.intercept_, alpha,
          X_train, y_train, 2 * sw, noise_scale=1e-9)

atol = 1e-4 * np.median(np.abs(coef_2sw))
assert_allclose(ridge_sklearn_2sw.coef_, coef_2sw, atol=atol)
assert_allclose(ridge_sklearn_2sw.intercept_, intercept_2sw, atol=atol)
# %%
# Large-ish scale benchmarks
rng = np.random.RandomState(42)
X_large, y_large = make_regression(
    n_samples=int(5e3),
    effective_rank=100,
    n_features=int(2e3),
    n_informative=50,
    random_state=0,
)
X_large = scale(X_large)

# %%
alpha = 1e-6

# %%
%%time
coef_large, intercept_large = ridge_bfgs(X_large, y_large, alpha)

# %%
check_min(coef_large, intercept_large,
          alpha, X_large, y_large, noise_scale=1e-6)

# %%
%%time
ridge_sklearn_large = Ridge(
    alpha=alpha,
    normalize=False,
    solver="cholesky",
    tol=solver_tol,
).fit(X_large, y_large)

# %%
check_min(ridge_sklearn_large.coef_, ridge_sklearn_large.intercept_,
          alpha, X_large, y_large, noise_scale=1e-12)

# %%
atol = 1e-2 * np.median(np.abs(coef_large))
assert_allclose(coef_large, ridge_sklearn_large.coef_, atol=atol)

# %%
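# Added benchmark sketch: the iterative "lsqr" solver is another natural
# candidate at this scale; with alpha this small the problem is quite
# ill-conditioned, so only report the deviation from the "cholesky"
# solution rather than asserting agreement.

# %%
%%time
ridge_sklearn_lsqr = Ridge(
    alpha=alpha,
    normalize=False,
    solver="lsqr",
    tol=solver_tol,
).fit(X_large, y_large)

# %%
print("max |lsqr - cholesky| coef deviation:",
      np.max(np.abs(ridge_sklearn_lsqr.coef_ - ridge_sklearn_large.coef_)))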