@denzilc
Created May 23, 2011 19:37
Large difference in classifier accuracies between LinearSVC and SVC
import numpy as np
import cPickle as pickle
from scipy import sparse
from scikits.learn.preprocessing.sparse import Normalizer
from scikits.learn import svm
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics.metrics import f1_score, classification_report, \
    confusion_matrix, precision_recall_fscore_support
from scikits.learn.cross_val import StratifiedKFold
from numpy import array
# Initialize default C and gamma values
C_start, C_end, C_step = -3, 4, 2
''' Extracts class labels from feature vectors '''
def extract_class_labels(totalfeats):
    train, target_labels = zip(*totalfeats)
    target_names = sorted(set(target_labels))
    targets = [target_names.index(l) for l in target_labels]
    return target_names, targets
''' Generates a CSR sparse matrix from the NLTK feature set
    (a DictVectorizer-based alternative is sketched after the script) '''
def gen_csr_matrix(totalfeats, uniq_feat_list):
    # Map each feature name to its column index once (list.index is O(n))
    feat_index = dict((feat, col) for col, feat in enumerate(uniq_feat_list))
    # Create I, J, V for COO sparse matrix format
    i_val = []
    j_val = []
    v_val = []
    for row, (feats, label) in enumerate(totalfeats):
        for key, value in feats.iteritems():
            if value != 0.0 and value != False:
                i_val.append(row)
                j_val.append(feat_index[key])
                # Encode boolean features as 1.0, keep numeric values as-is
                if value == True:
                    v_val.append(1.0)
                else:
                    v_val.append(value)
    # Generate the sparse matrix and convert COO -> CSR
    I = np.array(i_val)
    J = np.array(j_val)
    V = np.array(v_val)
    sparse_feats = sparse.coo_matrix((V, (I, J))).tocsr()
    print "Size of Sparse Matrix : ", sparse_feats.get_shape()
    return sparse_feats
''' Loads a pickled feature set and returns it '''
def load_feats(name):
    if name not in ('fs1', 'fs2', 'fs3'):
        # Fail loudly instead of returning an unbound variable
        raise ValueError('Please specify a correct feature set: fs1, fs2 or fs3')
    print 'Loading pickled feature set', name
    feature_file = open(name + '.dat', 'rb')
    totalfeats = pickle.load(feature_file)
    feature_file.close()
    print 'Feature set', name, 'loaded!'
    return totalfeats
if __name__ == "__main__":
    cross_fold = 10
    uniq_feat_set = set()
    # Load the feature set
    totalfeats = load_feats('fs1')
    # Extract class labels
    target_names, targets = extract_class_labels(totalfeats)
    for i, name in enumerate(target_names):
        print "Class " + str(i) + " corresponds to", name
    # Collect the unique feature names to determine the number of columns
    for feats, label in totalfeats:
        for key in feats:
            uniq_feat_set.add(key)
    rows = len(targets)
    cols = len(uniq_feat_set)
    print "Size of Matrix :", rows, cols
    # Convert to a list to fix the feature-name -> column mapping
    uniq_feat_list = list(uniq_feat_set)
    sparse_feats = gen_csr_matrix(totalfeats, uniq_feat_list)
    # # Convert to dense if required
    # dense_feats = sparse_feats.todense()
    # print "Size of Dense Matrix : ", dense_feats.shape
    target_labels = np.array(targets)
    print "Size of Target Label Vector : ", target_labels.size
    sparse_feats.eliminate_zeros()
    X = Normalizer().transform(sparse_feats, copy=True)
    Y = target_labels
    folds = StratifiedKFold(Y, cross_fold, indices=True)
    # Hold out half the data for evaluating the grid-searched model
    train, test = iter(StratifiedKFold(Y, 2, indices=True)).next()
    # Generate grid-search values for C and the stopping tolerance
    C_val = 2. ** np.arange(C_start, C_end + C_step, C_step)
    alpha_val = 10. ** np.arange(-5, -16, -1)
    grid_clf = svm.sparse.LinearSVC()
    # grid_clf = svm.sparse.SVC(kernel='linear')
    print grid_clf
    params = {'C': C_val, 'tol': alpha_val}
    grid_search = GridSearchCV(grid_clf, params, score_func=f1_score)
    grid_search.fit(X[train], Y[train], cv=StratifiedKFold(Y[train], 10, indices=True))
    y_true, y_pred = Y[test], grid_search.predict(X[test])
    print "Classification report for the best estimator: "
    print grid_search.best_estimator
    print "Tuned with optimal F1 value: %0.3f" % f1_score(y_true, y_pred)
    print classification_report(y_true, y_pred)
    # print "Grid scores:"
    # pprint(grid_search.grid_scores_)
    print "Best score: %0.3f" % grid_search.best_score
    best_parameters = grid_search.best_estimator._get_params()
    print "Best C: %0.3f " % best_parameters['C']
    print "Best tolerance: %0.16f " % best_parameters['tol']
    accuracies = []
    hate_precisions = []
    hate_recalls = []
    hate_f1s = []
    plate_precisions = []
    plate_recalls = []
    plate_f1s = []
    # clf = svm.sparse.SVC(kernel='linear', C=best_parameters['C'], tol=best_parameters['tol'])
    clf = svm.sparse.LinearSVC(C=best_parameters['C'], tol=best_parameters['tol'])
    print clf
    for i, (train, test) in enumerate(folds):
        # Train the model (named 'model' to avoid shadowing the svm module)
        model = clf.fit(X[train], Y[train])
        # Predict the values in the test fold
        Y_pred = clf.predict(X[test])
        # print "Number of Support Vectors :", model.support_vectors_.shape[0]
        # Confusion matrix
        conf_mat = confusion_matrix(Y[test], Y_pred)
        # print conf_mat
        prec, rec, f1, sup = precision_recall_fscore_support(Y[test], Y_pred)
        # print prec, rec, f1, sup
        # Accuracy from the 2x2 confusion matrix: correct predictions / total
        tp = conf_mat[0][0] + conf_mat[1][1]
        total = tp + conf_mat[0][1] + conf_mat[1][0]
        accuracy = tp / float(total)
        accuracies.append(accuracy)
        hate_precisions.append(prec[target_names.index('hate')])
        plate_precisions.append(prec[target_names.index('plate')])
        hate_recalls.append(rec[target_names.index('hate')])
        plate_recalls.append(rec[target_names.index('plate')])
        hate_f1s.append(f1[target_names.index('hate')])
        plate_f1s.append(f1[target_names.index('plate')])
    print 'Accuracy Mean : ', sum(accuracies) / float(cross_fold)
    print 'Accuracy Variance : ', float(array(accuracies).var())
    print 'Hate Precision Mean : ', sum(hate_precisions) / float(cross_fold)
    print 'Hate Precision Variance : ', float(array(hate_precisions).var())
    print 'Hate Recall Mean : ', sum(hate_recalls) / float(cross_fold)
    print 'Hate Recall Variance : ', float(array(hate_recalls).var())
    print 'Hate F-Measure Mean : ', sum(hate_f1s) / float(cross_fold)
    print 'Hate F-Measure Variance : ', float(array(hate_f1s).var())
    print 'Plate Precision Mean : ', sum(plate_precisions) / float(cross_fold)
    print 'Plate Precision Variance : ', float(array(plate_precisions).var())
    print 'Plate Recall Mean : ', sum(plate_recalls) / float(cross_fold)
    print 'Plate Recall Variance : ', float(array(plate_recalls).var())
    print 'Plate F-Measure Mean : ', sum(plate_f1s) / float(cross_fold)
    print 'Plate F-Measure Variance : ', float(array(plate_f1s).var())
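
As an aside, the hand-rolled feature-dict-to-matrix conversion above (extract_class_labels plus gen_csr_matrix) can now be done with scikit-learn's DictVectorizer. A minimal sketch under the modern sklearn API (the module paths were renamed from scikits.learn to sklearn after this gist was written), with made-up feature dicts:

from sklearn.feature_extraction import DictVectorizer

# Made-up NLTK-style (feature dict, label) pairs for illustration
totalfeats = [({'foo': 1.0, 'bar': True}, 'hate'),
              ({'baz': 2.5}, 'plate')]
feats, labels = zip(*totalfeats)

vec = DictVectorizer(sparse=True)  # returns a scipy sparse matrix
X = vec.fit_transform(feats)       # boolean values are coerced to 1.0
print(X.shape)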

pprett commented May 24, 2011

@mcenley: I've created a cleaned-up version of your script [1]. Again, SVMs are not scale-invariant - scaling or normalizing is crucial for good results. In your original script you use Normalizer, which does length normalization (the L1 norm of each row equals 1). Personally, I only use it when I work with word frequencies - but not when I have heterogeneous features that are measured on different scales (your setting?). For the latter I prefer centering (zero mean, unit variance) or normalization to the range [-1, 1] or [0, 1].
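
To make those alternatives concrete, here is a minimal sketch of the three options, written against the modern scikit-learn API; the toy matrix is made up for illustration:

import numpy as np
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler

# Made-up data: two heterogeneous features on very different scales
X = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [3.0, 800.0]])

# Length normalization, as in the script: each row scaled to unit L1 norm
X_len = Normalizer(norm='l1').fit_transform(X)

# Centering: each column gets zero mean and unit variance
# (for sparse matrices, use StandardScaler(with_mean=False) to stay sparse)
X_std = StandardScaler().fit_transform(X)

# Range normalization: each column is mapped into [0, 1]
X_01 = MinMaxScaler().fit_transform(X)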

Given that libsvm and liblinear optimize slightly different objective functions using different optimization techniques, their results are not too far off (see below). I do find it strange, however, that SVC gives different results for dense and sparse data... we'll have to investigate further. (A minimal sketch of such a comparison follows at the end of this comment.)

best,
Peter

LinearSVC
Accuracy: 0.6864
F-Score: 0.7196

SVC
Accuracy: 0.6568
F-Score: 0.7071

[1] https://gist.github.com/988586
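
As a hedged illustration of how such a side-by-side comparison can be run, here is a minimal sketch with the modern sklearn API and a synthetic dataset (the actual numbers above come from the cleaned-up script [1], not from this sketch):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, f1_score

# Synthetic two-class problem standing in for the real feature set
X, y = make_classification(n_samples=500, n_features=50, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# liblinear (LinearSVC) and libsvm (SVC) optimize slightly different
# objectives, so a small gap between their scores is expected
for clf in (LinearSVC(C=1.0), SVC(kernel='linear', C=1.0)):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("%s" % clf.__class__.__name__)
    print("Accuracy: %0.4f" % accuracy_score(y_test, y_pred))
    print("F-Score: %0.4f" % f1_score(y_test, y_pred))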
