Created July 8, 2017 04:27
Save pancodia/a1e76afd12b2d93af7ddabe53a55680a to your computer and use it in GitHub Desktop.
RidgeCV gives a different result from running Ridge with manually implemented CV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Compare RidgeCV against a manually implemented K-fold cross-validation loop.

Environment this was originally run under:
Python 3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09)

In [1]: import sklearn
In [2]: print(sklearn.__version__)
0.18.1
'''
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, RidgeCV
import numpy as np
from operator import itemgetter

# Load data (Hitters dataset), drop rows with missing values, and one-hot
# encode the categorical columns (drop_first avoids the dummy-variable trap).
hitters_df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Hitters.csv')
hitters_clean_df = hitters_df.dropna()
hitters_clean_df = pd.get_dummies(hitters_clean_df, drop_first=True)

# Prepare the predictors and response for regression.
X = hitters_clean_df.drop('Salary', axis=1)
y = hitters_clean_df.Salary

# Candidate regularization strengths, shared by both methods below.
alphas = 10**np.linspace(-4, 2, 100)

# Method 1: Using RidgeCV.
regr = RidgeCV(alphas=alphas, cv=10, normalize=True, scoring='neg_mean_squared_error')
regr.fit(X.values, y.values)
print('Best alpha from RidgeCV:', regr.alpha_)  # ('Best alpha from RidgeCV:', 18.738174228603832)

# Method 2: Prepare data with KFold and apply Ridge for each fold.
#
# BUG FIX: the original appended one score per (alpha, fold) pair — 100 alphas
# x 10 folds = 1000 entries — and then used the position of the global minimum
# to index `alphas`. That mis-associates scores with alphas (and the index can
# run past the end of `alphas` entirely). The CV score for an alpha is the
# MEAN test MSE across the K folds, so average per alpha before comparing.
K = 10  # number of folds
kf = KFold(n_splits=K, shuffle=True, random_state=123)
cv_scores = []  # one mean-MSE score per alpha, aligned with `alphas`
for alpha in alphas:
    fold_mses = []  # test MSE of each of the K folds for this alpha
    for train_idx, test_idx in kf.split(X.values, y.values):
        X_train = X.values[train_idx]
        y_train = y.values[train_idx]
        X_test = X.values[test_idx]
        y_test = y.values[test_idx]
        model = Ridge(alpha=alpha, normalize=True, fit_intercept=True).fit(X_train, y_train)
        test_errors = model.predict(X_test) - y_test
        fold_mses.append(np.mean(test_errors**2))
    cv_scores.append(np.mean(fold_mses))  # mean squared error averaged over folds
min_idx, min_cv_score = min(enumerate(cv_scores), key=itemgetter(1))
best_alpha = alphas[min_idx]
print('Best alpha from KFold+Ridge: ', best_alpha)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment