# Source: GitHub Gist by @zhangqiaorjc (gist id 4b2b85cf2e8abea2d17f),
# last active December 4, 2015.
# (GitHub page navigation text from the web scrape condensed into this header.)
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn import linear_model
from itertools import chain, combinations
from sklearn.cross_validation import cross_val_score
# Load the prostate data set (tab-separated). Column 0 of the file is
# skipped -- presumably a row-number column, TODO confirm -- and columns
# 1-10 are kept: the first eight are used as predictors below, while the
# 'lpsa' (response) and 'train' ('T'/'F' split flag) columns are read by name.
prost = pd.read_csv(
    "http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/prostate.data",
    sep="\t",
    usecols=list(range(1, 11)),
)

# Scale the eight predictor columns. Note that the scaling is computed over
# all rows (train and test together), before the split below.
# .iloc replaces the removed DataFrame.ix indexer for this positional slice.
prost_std = pd.DataFrame(
    scale(prost.iloc[:, 0:8]),
    index=prost.index,
    columns=prost.columns[0:8],
)

# Prepare the train/test split from the 'train' flag column.
# .values replaces the removed DataFrame.as_matrix() and returns the same
# plain NumPy arrays on both old and new pandas releases.
is_train = prost['train'] == 'T'
is_test = prost['train'] == 'F'
X_train = prost_std.loc[is_train].values
X_test = prost_std.loc[is_test].values
y_train = prost.loc[is_train, 'lpsa'].values
y_test = prost.loc[is_test, 'lpsa'].values
####################################
## Ordinary least squares
####################################
# Fit an unpenalized linear model on the training rows, then predict the
# held-out test rows.
ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
y_pred_ols = ols.predict(X_test)

# Single-argument print(...) produces the same output on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print(ols.coef_)

# Per-observation squared test errors: report their mean (test MSE) and the
# standard error of that mean (np.var uses the population variance, ddof=0).
ls_sq_errors = (y_pred_ols - y_test) ** 2
print(np.mean(ls_sq_errors))
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))
####################################
## Subset Selection
####################################
# Exhaustive best-subset search: score every non-empty subset of predictor
# columns with 3-fold cross-validation on the training data, keep the subset
# with the highest mean score, then refit on it and evaluate on the test set.
#
# NOTE(review): cross_val_score is imported at the top of this file from
# sklearn.cross_validation; scikit-learn >= 0.20 no longer ships that module
# (the function lives in sklearn.model_selection there) -- confirm the
# installed version or update the import.
n_features = X_train.shape[1]

# All subsets of sizes 1..n_features, smallest first. (Asking combinations()
# for a size larger than n_features would only yield nothing.)
subsets = chain.from_iterable(
    combinations(range(n_features), k) for k in range(1, n_features + 1)
)

lr = linear_model.LinearRegression()
best_score = -np.inf
best_subset = None
for subset in subsets:
    # Index columns with a list so NumPy selects exactly those columns.
    cols = list(subset)
    # Mean of the per-fold scores (cross_val_score's default scoring, i.e.
    # the estimator's own score method).
    score = np.mean(cross_val_score(lr, X_train[:, cols], y_train, cv=3))
    if score > best_score:
        best_score = score
        best_subset = subset

# Without this guard a run in which no score ever beats -inf (e.g. every
# score is NaN, or there are no features) would die later with a NameError.
if best_subset is None:
    raise ValueError("no feature subset produced a usable cross-validation score")

# Refit on the full training set using only the winning columns.
best_cols = list(best_subset)
lr.fit(X_train[:, best_cols], y_train)
y_pred_lr = lr.predict(X_test[:, best_cols])

# Expand the fitted coefficients to full length; dropped predictors stay 0
# so the output lines up with the other methods' coefficient vectors.
best_subset_coef = np.zeros(n_features)
best_subset_coef[best_cols] = lr.coef_
print(best_subset_coef)

# Test MSE and the standard error of that mean.
ls_sq_errors = (y_pred_lr - y_test) ** 2
print(np.mean(ls_sq_errors))
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))
####################################
## Ridge regression
####################################
# Ridge regression with the penalty strength chosen by 3-fold
# cross-validation over RidgeCV's default grid of alphas.
ridge = linear_model.RidgeCV(cv=3)
ridge.fit(X_train, y_train)

# Single-argument print(...) produces the same output on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print(ridge.coef_)

# Evaluate on the held-out rows: test MSE and the standard error of that
# mean (np.var uses the population variance, ddof=0).
y_pred_ridge = ridge.predict(X_test)
ls_sq_errors = (y_pred_ridge - y_test) ** 2
print(np.mean(ls_sq_errors))
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))
####################################
## Lasso regression
####################################
# Lasso with the penalty strength chosen by cross-validation along LassoCV's
# automatically generated alpha path.
# cv=3 is pinned explicitly: it matches the 3-fold CV used for ridge and
# subset selection in this script, and keeps the result independent of the
# library default (3 folds in older scikit-learn, 5 folds from 0.22 on).
lasso = linear_model.LassoCV(cv=3)
lasso.fit(X_train, y_train)

# Single-argument print(...) produces the same output on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print(lasso.coef_)

# Evaluate on the held-out rows: test MSE and the standard error of that
# mean (np.var uses the population variance, ddof=0).
y_pred_lasso = lasso.predict(X_test)
ls_sq_errors = (y_pred_lasso - y_test) ** 2
print(np.mean(ls_sq_errors))
print(np.sqrt(np.var(ls_sq_errors) / len(y_test)))
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")