Skip to content

Instantly share code, notes, and snippets.

@ShivendraAgrawal
Last active December 12, 2016 18:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ShivendraAgrawal/f37bfc2e824cf9dd241aaa43a59749e1 to your computer and use it in GitHub Desktop.
Save ShivendraAgrawal/f37bfc2e824cf9dd241aaa43a59749e1 to your computer and use it in GitHub Desktop.
Scikit-Learn sample snippets for copy-paste
SOURCE http://blog.datadive.net/selecting-good-features-part-ii-linear-models-and-regularization/
# Correlation
import numpy as np
from scipy.stats import pearsonr
np.random.seed(0)
size = 300
x = np.random.normal(0, 1, size)
print "Lower noise", pearsonr(x, x + np.random.normal(0, 1, size))
print "Higher noise", pearsonr(x, x + np.random.normal(0, 10, size))
#Feature selection Random forrest
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#Load boston housing dataset as an example
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]
rf = RandomForestRegressor()
rf.fit(X, Y)
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),
reverse=True)
#Shuffle split and feature selection
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import r2_score
from collections import defaultdict
X = boston["data"]
Y = boston["target"]
rf = RandomForestRegressor()
scores = defaultdict(list)
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in ShuffleSplit(len(X), 100, .3):
X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]
r = rf.fit(X_train, Y_train)
acc = r2_score(Y_test, rf.predict(X_test))
for i in range(X.shape[1]):
X_t = X_test.copy()
np.random.shuffle(X_t[:, i])
shuff_acc = r2_score(Y_test, rf.predict(X_t))
scores[names[i]].append((acc-shuff_acc)/acc)
print "Features sorted by their score:"
print sorted([(round(np.mean(score), 4), feat) for
feat, score in scores.items()], reverse=True)
# Linear Model with L1 Lasso regularization
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
boston = load_boston()
scaler = StandardScaler()
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]
lasso = Lasso(alpha=.3)
lasso.fit(X, Y)
print "Lasso model: ", pretty_print_linear(lasso.coef_, names, sort = True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment