Created
May 23, 2011 19:37
-
-
Save denzilc/987394 to your computer and use it in GitHub Desktop.
High difference in classifier accuracies with LinearSVC and SVC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import cPickle as pickle | |
from scipy import sparse | |
from scikits.learn.preprocessing.sparse import Normalizer | |
from scikits.learn import svm | |
from scikits.learn.grid_search import GridSearchCV | |
from scikits.learn.metrics.metrics import f1_score, classification_report,\ | |
confusion_matrix, precision_recall_fscore_support | |
from scikits.learn.cross_val import StratifiedKFold | |
from numpy import array | |
# Exponent range for the grid search over C: the candidate values are
# 2 ** k for k running from C_start to C_end (inclusive) in steps of C_step.
C_start = -3
C_end = 4
C_step = 2
def extract_class_labels(totalfeats):
    """Turn the label of every (features, label) pair into an integer class id.

    Args:
        totalfeats: sequence of ``(features, label)`` pairs; only the label
            of each pair is used.

    Returns:
        A tuple ``(target_names, targets)``: ``target_names`` is the sorted
        list of distinct labels and ``targets`` is a list holding, for each
        input pair in order, the index of its label in ``target_names``.
        Empty input yields ``([], [])``.
    """
    # The parameter used to be spelled "totafeats" while the body read
    # ``totalfeats``, so the function only worked through a global variable
    # of that name; it now uses its own argument.
    target_labels = [label for _, label in totalfeats]
    target_names = sorted(set(target_labels))
    # Build the label -> index table once instead of calling list.index()
    # for every sample.
    index_of = dict((name, idx) for idx, name in enumerate(target_names))
    targets = [index_of[label] for label in target_labels]
    return target_names, targets
def gen_csr_matrix(totalfeats, uniq_feat_list):
    """Generate a CSR sparse matrix from an NLTK-style feature set.

    Args:
        totalfeats: iterable of ``(features, label)`` pairs where
            ``features`` maps a feature name to a numeric or boolean value.
            Labels are ignored here.
        uniq_feat_list: list of feature names; the position of a name in
            this list is the column it occupies in the matrix.

    Returns:
        A ``scipy.sparse`` CSR matrix of floats with one row per input pair
        and one column per entry of ``uniq_feat_list``. Zero / ``False``
        feature values are not stored; ``True`` is stored as ``1.0``.

    Raises:
        ValueError: if a stored feature is missing from ``uniq_feat_list``.
    """
    # Name -> column table built once; list.index() per entry made the
    # conversion quadratic in the number of features. setdefault keeps the
    # first position if a name is listed twice, exactly like list.index().
    column_of = {}
    for col, key in enumerate(uniq_feat_list):
        column_of.setdefault(key, col)

    # Create I,J,V for COO sparse matrix format
    row_idx = []
    col_idx = []
    values = []
    n_rows = 0
    for row, (feats, _label) in enumerate(totalfeats):
        n_rows = row + 1
        for key, value in feats.items():
            # 0, 0.0 and False all compare equal to 0.0: nothing to store.
            if value == 0.0:
                continue
            try:
                col = column_of[key]
            except KeyError:
                raise ValueError("%r is not in uniq_feat_list" % (key,))
            row_idx.append(row)
            col_idx.append(col)
            values.append(float(value))

    # Generate Sparse Matrix. The shape is given explicitly: when it is
    # inferred from the largest index, trailing all-zero rows or columns are
    # dropped and the matrix stops lining up with the label vector.
    I = np.array(row_idx, dtype=np.intp)
    J = np.array(col_idx, dtype=np.intp)
    V = np.array(values, dtype=np.float64)
    shape = (n_rows, len(uniq_feat_list))
    sparse_feats = sparse.coo_matrix((V, (I, J)), shape=shape).tocsr()
    # Single pre-joined string so the line is emitted identically whether
    # print is a statement or a function.
    print("Size of Sparse Matrix : " + " " + str(sparse_feats.shape))
    return sparse_feats
def load_feats(name):
    """Load a pickled feature set from the current working directory.

    Args:
        name: one of ``'fs1'``, ``'fs2'`` or ``'fs3'``; the feature set is
            read from the file ``'<name>.dat'``.

    Returns:
        The object stored in the pickle file.

    Raises:
        ValueError: if ``name`` is not one of the known feature sets.
    """
    if name not in ('fs1', 'fs2', 'fs3'):
        # Previously this case printed the message and then failed with an
        # UnboundLocalError on the return statement.
        raise ValueError('Please specify correct feature set')

    # Load the pickled feature set
    print('Loading pickled feature set ')
    # NOTE(security): unpickling can execute arbitrary code; only load
    # feature files that come from a trusted source.
    with open(name + '.dat', 'rb') as feature_file:
        totalfeats = pickle.load(feature_file)
    # name[2:] is the set number, e.g. 'fs2' -> 'Feature Set2 loaded!'
    print('Feature Set' + name[2:] + ' loaded!')
    return totalfeats
if __name__ == "__main__": | |
cross_fold = 10 | |
uniq_feat_set = set() | |
# Load the Feature Set | |
# totalfeats = load_feats('fs1') | |
totalfeats = load_feats('fs1') | |
# Extract Class Labels | |
target_names, targets = extract_class_labels(totalfeats) | |
for i,name in enumerate(target_names): | |
print "Class " + str(i) + " corresponds to", name | |
# Find out size of columns | |
for feats,label in totalfeats: | |
for key in feats: | |
uniq_feat_set.add(key) | |
rows = len(targets) | |
cols = len(uniq_feat_set) | |
print "Size Of Matrix :", rows,cols | |
# Convert to list to create dict-column mapping | |
uniq_feat_list = list(uniq_feat_set) | |
sparse_feats = gen_csr_matrix(totalfeats, uniq_feat_list) | |
# # Convert to Dense if required | |
# dense_feats = sparse_feats.todense() | |
# print "Size of Dense Matrix : ", dense_feats.shape | |
target_labels = np.array(targets) | |
print "Size of Target Label Vector : ",target_labels.size | |
sparse_feats.eliminate_zeros() | |
X = Normalizer().transform(sparse_feats, copy=True) | |
Y = target_labels | |
folds = StratifiedKFold(Y, cross_fold, indices=True) | |
train, test = iter(StratifiedKFold(Y, 2, indices = True)).next() | |
# Generate grid search values for C, gamma | |
C_val = 2. ** np.arange(C_start, C_end + C_step, C_step) | |
alpha_val = 10. ** np.arange(-5,-16,-1) | |
grid_clf = svm.sparse.LinearSVC() | |
# grid_clf = svm.sparse.SVC(kernel = 'linear') | |
print grid_clf | |
params = {'C': C_val, 'tol' : alpha_val} | |
grid_search = GridSearchCV(grid_clf , params, score_func = f1_score) | |
grid_search.fit(X[train], Y[train], cv = StratifiedKFold(Y[train],10, indices = True)) | |
y_true, y_pred = Y[test], grid_search.predict(X[test]) | |
print "Classification report for the best estimator: " | |
print grid_search.best_estimator | |
print "Tuned for with optimal value: %0.3f" % f1_score(y_true, y_pred) | |
print classification_report(y_true, y_pred) | |
# print "Grid scores:" | |
# pprint(grid_search.grid_scores_) | |
print "Best score: %0.3f" % grid_search.best_score | |
best_parameters = grid_search.best_estimator._get_params() | |
print "Best C: %0.3f " % best_parameters['C'] | |
print "Best tolerance: %0.16f " %best_parameters['tol'] | |
accuracies = [] | |
hate_precisions = [] | |
hate_recalls = [] | |
hate_f1s = [] | |
plate_precisions = [] | |
plate_recalls = [] | |
plate_f1s = [] | |
# clf = svm.sparse.SVC(kernel = 'linear', C = best_parameters['C'], tol = best_parameters['tol']) | |
clf = svm.sparse.LinearSVC(C = best_parameters['C'], tol = best_parameters['tol']) | |
print clf | |
for i, (train,test) in enumerate(folds): | |
# Train the model | |
svm = clf.fit(X[train], Y[train]) | |
# Predict the values in the test class | |
Y_pred = clf.predict(X[test]) | |
# print "Number of Support Vectors :", svm.support_vectors_.shape[0] | |
# Confusion Matrix | |
conf_mat = confusion_matrix(Y[test], Y_pred) | |
# print conf_mat | |
prec,rec,f1,sup = precision_recall_fscore_support(Y[test], Y_pred) | |
# print prec,rec,f1,sup | |
tp = conf_mat[0][0] + conf_mat[1][1] | |
total = tp + conf_mat[0][1] + conf_mat[1][0] | |
accuracy = (tp/float(total)) | |
accuracies.append(accuracy) | |
hate_precisions.append(prec[target_names.index('hate')]) | |
plate_precisions.append(prec[target_names.index('plate')]) | |
hate_recalls.append(rec[target_names.index('hate')]) | |
plate_recalls.append(rec[target_names.index('plate')]) | |
hate_f1s.append(f1[target_names.index('hate')]) | |
plate_f1s.append(f1[target_names.index('plate')]) | |
print 'Accuracy Mean : ', sum(accuracies) / float(cross_fold) | |
print 'Accuracy Variance: ', float(array(accuracies).var()) | |
print 'Hate Precision Mean:', sum(hate_precisions) / float(cross_fold) | |
print 'Hate Precision Variance:', float(array(hate_precisions).var()) | |
print 'Hate Recall Mean: ', sum(hate_recalls) / float(cross_fold) | |
print 'Hate Recall Variance: ', float(array(hate_recalls).var()) | |
print 'Hate F-Measure Mean: ', sum(hate_f1s) / float(cross_fold) | |
print 'Hate F-Measure Variance: ', float(array(hate_f1s).var()) | |
print 'Plate Precision Mean: ', sum(plate_precisions) / float(cross_fold) | |
print 'Plate Precision Variance: ', float(array(plate_precisions).var()) | |
print 'Plate Recall Mean:', sum(plate_recalls) / float(cross_fold) | |
print 'Plate Recall Variance: ', float(array(plate_recalls).var()) | |
print 'Plate F-Measure Mean: ', (sum(plate_f1s)) / float(cross_fold) | |
print 'Plate F-Measure Variance: ', float(array(plate_f1s).var()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@mcenley: I've created a cleaned-up version of your script [1]. Again, SVMs are not scale invariant - scaling or normalizing is crucial for good results. In your original script you use Normalizer which does length normalization (L1 norm of each row equals 1). Personally, I only use it when I work with word frequencies - but not when I have heterogeneous features which are measured on different scales (your setting?). For the latter I prefer centering (zero mean, unit variance) or normalization to the range [-1,1] or [0,1].
Given that libsvm and liblinear optimize slightly different objective functions using different optimization techniques, their results are not too far off (see below). I do find it strange, however, that SVC has different results for dense and sparse data... we'll have to investigate further.
best,
Peter
LinearSVC
Accuracy: 0.6864
F-Score: 0.7196
SVC
Accuracy: 0.6568
F-Score: 0.7071
[1] https://gist.github.com/988586