@agramfort
Created September 17, 2010 21:10
"""
==========================
Feature Selection Shootout
==========================
Illustration of feature selection with:
- RFE-SVC
- Anova-SVC
- L1-Logistic Regression
"""
import numpy as np
from scikits.learn import svm
from scikits.learn import logistic
from scikits.learn.datasets import load_digits
from scikits.learn import feature_selection
from scikits.learn import cross_val
from scikits.learn.pipeline import Pipeline
from scikits.learn.metrics import zero_one
################################################################################
# Loading digits dataset
digits = load_digits()
X = digits.data
y = digits.target
images = digits.images
images_size = images[0].shape
# Keep only the 1's and 6's
idx = np.logical_or(y == 1, y == 6)
X = X[idx]
y = y[idx]
cv = cross_val.StratifiedKFold(y, 2) # Cross-validation procedure
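# Added sanity check (not in the original gist): each stratified fold should
# hold a roughly balanced mix of both classes. Assumes the folds iterate as
# (train, test) arrays, exactly as they are consumed below.
for i, (train, test) in enumerate(cv):
    print 'Fold %d: %d ones / %d sixes in train' % (
        i, (y[train] == 1).sum(), (y[train] == 6).sum())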
################################################################################
# Select features with SVC - RFE
svc = svm.SVC(kernel='linear')
rfecv = feature_selection.RFECV(estimator=svc, n_features=20, percentage=0.01,
                                loss_func=zero_one)
rfecv.fit(X, y, cv=cv)
print 'Optimal number of features with RFE: %d' % rfecv.support_.sum()
rfe_selected_voxels = rfecv.support_.reshape(images_size)
################################################################################
# Select features with Anova
anova = feature_selection.SelectPercentile(feature_selection.f_classif)
svc = svm.SVC(kernel='linear')
clf = Pipeline([('anova', anova), ('svc', svc)])
percentiles = (50, 80, 90)
best_score = None
anova_selected_voxels = None
for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    scores = [clf.fit(X[train], y[train]).score(X[test], y[test])
              for train, test in cv]
    score = np.mean(scores)
    if best_score is None or score > best_score:
        best_score = score
        anova_selected_voxels = anova.get_support().reshape(images_size)
print 'Optimal number of features with Anova-SVC: %d' % \
    anova_selected_voxels.sum()
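# Added sketch (not in the original gist): the univariate scores that
# SelectPercentile ranks can also be inspected directly; f_classif returns
# per-feature F values and p-values.
F, pv = feature_selection.f_classif(X, y)
print 'Highest Anova F-score at pixel %d (F = %.1f)' % (np.argmax(F), F.max())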
###############################################################################
# Select features with L1-Logistic
Cs = (0.01, 0.1, 1, 10)
best_score = None
logistic_selected_voxels = None
clf = logistic.LogisticRegression(penalty='l1')  # L1 penalty so coef_ is sparse
for C in Cs:
    clf.C = C
    scores = [clf.fit(X[train], y[train]).score(X[test], y[test])
              for train, test in cv]
    score = np.mean(scores)
    if best_score is None or score > best_score:
        best_score = score
        logistic_selected_voxels = (clf.coef_ != 0).reshape(images_size)
print 'Optimal number of features with L1-Logistic Regression: %d' % \
    logistic_selected_voxels.sum()
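# Added sketch (not in the original gist): with an L1 penalty, smaller C
# means stronger regularization and hence more coefficients driven exactly
# to zero, which is what makes the coefficient support usable as a feature
# selector. Constructor arguments assume the scikits.learn API of the time.
for C in Cs:
    n_nonzero = (logistic.LogisticRegression(penalty='l1', C=C)
                 .fit(X, y).coef_ != 0).sum()
    print 'C=%g -> %d non-zero coefficients' % (C, n_nonzero)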
###############################################################################
# Plot selected voxels
import pylab as pl
pl.figure()
pl.subplot(1, 3, 1)
pl.imshow(rfe_selected_voxels, cmap=pl.cm.gray, interpolation='nearest')
pl.title('RFE')
pl.subplot(1, 3, 2)
pl.imshow(anova_selected_voxels, cmap=pl.cm.gray, interpolation='nearest')
pl.title('Anova')
pl.subplot(1, 3, 3)
pl.imshow(logistic_selected_voxels, cmap=pl.cm.gray, interpolation='nearest')
pl.title('Logistic L1')
pl.show()