Last active
December 4, 2015 07:48
-
-
Save zhangqiaorjc/4b2b85cf2e8abea2d17f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import scale | |
from sklearn import linear_model | |
from itertools import chain, combinations | |
from sklearn.cross_validation import cross_val_score | |
# Load the prostate cancer dataset (from the ESL book site); column 0 is a
# row index, so keep only columns 1-10 (8 predictors, 'lpsa', 'train').
prost = pd.read_csv(
    "http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/prostate.data",
    sep="\t", usecols=range(1, 11))

# Standardize the 8 predictor columns to zero mean / unit variance.
# NOTE: .ix and .as_matrix() were removed from pandas; use .iloc / .to_numpy().
prost_std = pd.DataFrame(scale(prost.iloc[:, 0:8]),
                         index=prost.index, columns=prost.columns[0:8])

# Train/test split is encoded in the 'train' column ('T' = train, 'F' = test);
# the response variable is 'lpsa'.
X_train = prost_std.loc[prost['train'] == 'T'].to_numpy()
X_test = prost_std.loc[prost['train'] == 'F'].to_numpy()
y_train = prost.loc[prost['train'] == 'T', 'lpsa'].to_numpy()
y_test = prost.loc[prost['train'] == 'F', 'lpsa'].to_numpy()
####################################
## Ordinary least squares
####################################
# Fit OLS on the standardized training predictors, then report:
#   - the fitted coefficients,
#   - the mean squared prediction error on the test set,
#   - its standard error (std of the squared errors over sqrt(n_test)).
ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
y_pred_ols = ols.predict(X_test)
print(ols.coef_)
ls_sq_errors = (y_pred_ols - y_test) ** 2
print(np.mean(ls_sq_errors))                        # test MSE
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))  # std. error of the MSE
####################################
## Best-subset selection
####################################
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

n_features = X_train.shape[1]

# Enumerate every non-empty feature subset (sizes 1..n_features).
# The original iterated k up to n_features + 1; combinations() of more items
# than the pool is empty, so that last pass was a silent no-op — dropped.
subsets = chain.from_iterable(
    combinations(range(n_features), k + 1) for k in range(n_features))

lr = linear_model.LinearRegression()
best_score = -np.inf
best_subset = None
for subset in subsets:
    # Mean 3-fold cross-validated score on the training set for this subset.
    score = np.mean(cross_val_score(lr, X_train[:, subset], y_train, cv=3))
    if score > best_score:
        best_score = score
        best_subset = subset

# Refit on the full training set using the winning subset and evaluate.
lr.fit(X_train[:, best_subset], y_train)
y_pred_lr = lr.predict(X_test[:, best_subset])

# Expand the coefficients back to full length (zeros for dropped features)
# so the printout lines up with the other methods' coefficient vectors.
best_subset_coef = np.zeros(n_features)
best_subset_coef[list(best_subset)] = lr.coef_
print(best_subset_coef)
ls_sq_errors = (y_pred_lr - y_test) ** 2
print(np.mean(ls_sq_errors))                        # test MSE
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))  # std. error of the MSE
####################################
## Ridge regression
####################################
# RidgeCV picks the regularization strength alpha by 3-fold cross-validation
# over its default alpha grid, then refits on the whole training set.
ridge = linear_model.RidgeCV(cv=3)
ridge.fit(X_train, y_train)
print(ridge.coef_)
y_pred_ridge = ridge.predict(X_test)
ls_sq_errors = (y_pred_ridge - y_test) ** 2
print(np.mean(ls_sq_errors))                        # test MSE
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))  # std. error of the MSE
####################################
## Lasso regression
####################################
# LassoCV selects alpha by cross-validation along its regularization path;
# lasso coefficients may be exactly zero (implicit feature selection).
lasso = linear_model.LassoCV()
lasso.fit(X_train, y_train)
print(lasso.coef_)
y_pred_lasso = lasso.predict(X_test)
ls_sq_errors = (y_pred_lasso - y_test) ** 2
print(np.mean(ls_sq_errors))                        # test MSE
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))  # std. error of the MSE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment