Skip to content

Instantly share code, notes, and snippets.

@pancodia
Created July 8, 2017 04:27
Show Gist options
  • Save pancodia/a1e76afd12b2d93af7ddabe53a55680a to your computer and use it in GitHub Desktop.
Save pancodia/a1e76afd12b2d93af7ddabe53a55680a to your computer and use it in GitHub Desktop.
RidgeCV gives a different result from running Ridge with manually implemented CV
'''
Python 3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09)
In [1]: import sklearn
In [2]: print(sklearn.__version__)
0.18.1
'''
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, RidgeCV
import numpy as np
from operator import itemgetter
# Fetch the Hitters dataset straight from GitHub (needs network access).
hitters_df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/Hitters.csv'
)
# Discard rows with missing values, then dummy-encode the remaining columns,
# dropping the first level of each encoded variable.
hitters_clean_df = pd.get_dummies(hitters_df.dropna(), drop_first=True)
# Regression inputs: the response is Salary, the predictors are every other column.
y = hitters_clean_df['Salary']
X = hitters_clean_df.drop('Salary', axis=1)
# Method 1: let RidgeCV search the alpha grid with its built-in cross-validation
# Candidate penalties: 100 values evenly spaced on a log10 scale from 1e-4 to 1e2.
alphas = np.logspace(-4, 2, num=100)
regr = RidgeCV(
    alphas=alphas,
    cv=10,  # integer cv: scikit-learn constructs the 10 folds itself
    normalize=True,
    scoring='neg_mean_squared_error',
)
regr.fit(X.values, y.values)
# Output recorded by the author: ('Best alpha from RidgeCV:', 18.738174228603832)
print('Best alpha from RidgeCV:', regr.alpha_)
# Method 2: Prepare data with KFold and apply Ridge for each fold
# For every candidate alpha, fit Ridge on each training split, measure the
# mean squared error on the held-out split, and score the alpha by the
# average of those per-fold errors. The alpha with the lowest score wins.
alphas = 10**np.linspace(-4, 2, 100)
K = 10  # Number of folds
kf = KFold(n_splits=K, shuffle=True, random_state=123)
# NOTE(review): these folds are shuffled, while Method 1 passes a plain integer
# cv to RidgeCV. Per the scikit-learn docs an integer cv presumably means
# unshuffled KFold, so the two methods are scored on different splits -- confirm
# before expecting identical alphas.

# Pull the arrays out once instead of on every loop iteration.
X_values = X.values
y_values = y.values
# Materialise the splits once so every alpha is evaluated on exactly the same folds.
folds = list(kf.split(X_values, y_values))

cv_scores = []  # exactly one entry per alpha, aligned with `alphas`
for alpha in alphas:
    fold_mses = []  # held-out MSE of each fold for this alpha
    for train_idx, test_idx in folds:
        X_train = X_values[train_idx]
        y_train = y_values[train_idx]
        X_test = X_values[test_idx]
        y_test = y_values[test_idx]
        model = Ridge(alpha=alpha, normalize=True, fit_intercept=True).fit(X_train, y_train)
        test_errors = model.predict(X_test) - y_test
        fold_mses.append(np.mean(test_errors**2))
    # The CV score of an alpha is its MSE averaged over ALL folds. Appending
    # once per alpha keeps cv_scores index-aligned with alphas, which the
    # lookup of best_alpha below relies on.
    cv_scores.append(np.mean(fold_mses))

# np.argmin returns the first minimum, matching min() on an enumerated list.
min_idx = int(np.argmin(cv_scores))
min_cv_score = cv_scores[min_idx]
best_alpha = alphas[min_idx]
print('Best alpha from KFold+Ridge: ', best_alpha)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment