-
-
Save denzilc/987394 to your computer and use it in GitHub Desktop.
import numpy as np | |
import cPickle as pickle | |
from scipy import sparse | |
from scikits.learn.preprocessing.sparse import Normalizer | |
from scikits.learn import svm | |
from scikits.learn.grid_search import GridSearchCV | |
from scikits.learn.metrics.metrics import f1_score, classification_report,\ | |
confusion_matrix, precision_recall_fscore_support | |
from scikits.learn.cross_val import StratifiedKFold | |
from numpy import array | |
# Default base-2 exponent range (start, end, step) for the C values tried in the grid search
C_start, C_end, C_step = -3, 4, 2 | |
def extract_class_labels(totafeats):
    ''' Extracts Class Labels from feature vectors.

    Args:
        totafeats: sequence of ``(features, label)`` pairs. (The parameter
            name keeps its historical spelling so existing callers are
            unaffected.)

    Returns:
        A ``(target_names, targets)`` tuple: ``target_names`` is the sorted
        list of distinct labels and ``targets`` holds, for each input pair in
        order, the index of its label within ``target_names``. Empty input
        yields ``([], [])``.
    '''
    # Read the labels from the argument itself; the body previously referred
    # to a differently spelled name and only worked through a global.
    target_labels = [label for _, label in totafeats]
    target_names = sorted(set(target_labels))
    # One dict lookup per sample instead of a linear list.index() scan.
    index_of = dict((name, idx) for idx, name in enumerate(target_names))
    targets = [index_of[label] for label in target_labels]
    return target_names, targets
def gen_csr_matrix(totalfeats, uniq_feat_list):
    ''' Generate a CSR sparse matrix from the NLTK feature set.

    Args:
        totalfeats: iterable of ``(features, label)`` pairs where
            ``features`` is a dict mapping feature name to value. Labels are
            ignored here.
        uniq_feat_list: list of feature names; a name's position in the list
            is its column index.

    Returns:
        A scipy CSR matrix with one row per input pair and one column per
        entry of ``uniq_feat_list``. Values equal to zero/False are not
        stored; values equal to True are stored as 1.0.

    Raises:
        ValueError: if a feature with a non-zero value is missing from
            ``uniq_feat_list``.
    '''
    # Resolve name -> column once. setdefault keeps the first occurrence,
    # which is what list.index() would have returned.
    column_of = {}
    for col, feat_name in enumerate(uniq_feat_list):
        column_of.setdefault(feat_name, col)

    # Create I,J,V for COO sparse matrix format
    i_val = []
    j_val = []
    v_val = []
    n_rows = 0
    for row, (feats, label) in enumerate(totalfeats):
        n_rows = row + 1
        for key, value in feats.items():
            # False compares equal to 0, so this also skips False values.
            if value != 0:
                try:
                    col = column_of[key]
                except KeyError:
                    raise ValueError('%r is not in list' % (key,))
                i_val.append(row)
                j_val.append(col)
                # Equality (not identity) on purpose: True and 1 both become 1.0.
                v_val.append(1.0 if value == True else value)

    # Generate Sparse Matrix
    I = np.array(i_val, dtype=np.intp)
    J = np.array(j_val, dtype=np.intp)
    V = np.array(v_val)
    # Pass the shape explicitly: inferring it from the largest index drops
    # trailing all-zero rows and unused columns, which would leave the matrix
    # out of step with the label vector.
    shape = (n_rows, len(uniq_feat_list))
    sparse_feats = sparse.coo_matrix((V, (I, J)), shape=shape).tocsr()
    print("Size of Sparse Matrix :  %s" % (sparse_feats.shape,))
    return sparse_feats
# Pickle file and completion message for every supported feature-set name.
_FEATURE_SETS = {
    'fs1': ('fs1.dat', 'Feature Set1 loaded!'),
    'fs2': ('fs2.dat', 'Feature Set2 loaded!'),
    'fs3': ('fs3.dat', 'Feature Set3 loaded!'),
}


def load_feats(name):
    ''' Loads a Feature Set and returns the dump.

    Args:
        name: one of 'fs1', 'fs2' or 'fs3'; selects the pickle file
            ('fs1.dat', 'fs2.dat', 'fs3.dat') opened relative to the current
            working directory.

    Returns:
        Whatever object was pickled into the selected file.

    Raises:
        ValueError: if ``name`` is not a known feature set. (Previously this
            case printed a hint and then failed on an unbound local.)
    '''
    try:
        path, done_msg = _FEATURE_SETS[name]
    except KeyError:
        raise ValueError('Please specify correct feature set: %r' % (name,))

    # Load the pickled feature set
    print('Loading pickled feature set ')
    # NOTE(security): unpickling can execute arbitrary code; only load
    # feature files that come from a trusted source.
    # The with-block closes the file even when unpickling fails.
    with open(path, 'rb') as feature_file:
        totalfeats = pickle.load(feature_file)
    print(done_msg)
    return totalfeats
if __name__ == "__main__":
    # Script flow: load feature set 'fs1', build a normalized sparse matrix,
    # tune LinearSVC (C, tol) by grid search on one half of the data, report
    # on the other half, then run a stratified cross-validation with the best
    # parameters and print per-class statistics for 'hate' and 'plate'.
    cross_fold = 10

    # Load the Feature Set
    totalfeats = load_feats('fs1')

    # Extract Class Labels
    target_names, targets = extract_class_labels(totalfeats)
    for i, name in enumerate(target_names):
        print("Class %d corresponds to %s" % (i, name))

    # Resolve the two reported classes once, before the expensive search, so
    # a feature set lacking either label fails immediately.
    hate_idx = target_names.index('hate')
    plate_idx = target_names.index('plate')

    # Find out size of columns: one column per distinct feature name.
    uniq_feat_set = set()
    for feats, label in totalfeats:
        uniq_feat_set.update(feats)
    rows = len(targets)
    cols = len(uniq_feat_set)
    print("Size Of Matrix : %d %d" % (rows, cols))

    # Convert to list to create dict-column mapping
    uniq_feat_list = list(uniq_feat_set)
    sparse_feats = gen_csr_matrix(totalfeats, uniq_feat_list)

    target_labels = np.array(targets)
    print("Size of Target Label Vector :  %d" % target_labels.size)

    sparse_feats.eliminate_zeros()
    X = Normalizer().transform(sparse_feats, copy=True)
    Y = target_labels

    folds = StratifiedKFold(Y, cross_fold, indices=True)
    # First split of a 2-fold partition: `train` is used for the grid search,
    # `test` for the report on the tuned estimator.
    train, test = next(iter(StratifiedKFold(Y, 2, indices=True)))

    # Generate grid search values for C and the tolerance.
    # NOTE: the stop value is C_end + C_step, so with the defaults (-3, 4, 2)
    # the exponents are -3, -1, 1, 3, 5, i.e. the grid reaches past C_end.
    C_val = 2. ** np.arange(C_start, C_end + C_step, C_step)
    alpha_val = 10. ** np.arange(-5, -16, -1)

    grid_clf = svm.sparse.LinearSVC()
    print(grid_clf)
    params = {'C': C_val, 'tol': alpha_val}
    grid_search = GridSearchCV(grid_clf, params, score_func=f1_score)
    grid_search.fit(X[train], Y[train],
                    cv=StratifiedKFold(Y[train], 10, indices=True))
    y_true, y_pred = Y[test], grid_search.predict(X[test])

    print("Classification report for the best estimator: ")
    print(grid_search.best_estimator)
    print("Tuned for with optimal value: %0.3f" % f1_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("Best score: %0.3f" % grid_search.best_score)
    best_parameters = grid_search.best_estimator._get_params()
    print("Best C: %0.3f " % best_parameters['C'])
    print("Best tolerance: %0.16f " % best_parameters['tol'])

    accuracies = []
    hate_precisions = []
    hate_recalls = []
    hate_f1s = []
    plate_precisions = []
    plate_recalls = []
    plate_f1s = []

    clf = svm.sparse.LinearSVC(C=best_parameters['C'],
                               tol=best_parameters['tol'])
    print(clf)

    for fold_train, fold_test in folds:
        # Train the model. The result is deliberately not bound to `svm`,
        # which would overwrite the imported svm module.
        clf.fit(X[fold_train], Y[fold_train])
        # Predict the values in the test class
        Y_pred = clf.predict(X[fold_test])

        # Accuracy = correctly classified (diagonal) / all samples. For the
        # two-class case this equals the former hand-indexed 2x2 formula, and
        # it stays valid for any other matrix size.
        conf_mat = np.asarray(confusion_matrix(Y[fold_test], Y_pred))
        accuracies.append(np.trace(conf_mat) / float(conf_mat.sum()))

        prec, rec, f1, sup = precision_recall_fscore_support(Y[fold_test],
                                                             Y_pred)
        hate_precisions.append(prec[hate_idx])
        plate_precisions.append(prec[plate_idx])
        hate_recalls.append(rec[hate_idx])
        plate_recalls.append(rec[plate_idx])
        hate_f1s.append(f1[hate_idx])
        plate_f1s.append(f1[plate_idx])

    # (mean label, variance label, per-fold scores), in report order. The
    # label strings are kept exactly as they were printed before.
    summary = [
        ('Accuracy Mean : ', 'Accuracy Variance: ', accuracies),
        ('Hate Precision Mean:', 'Hate Precision Variance:', hate_precisions),
        ('Hate Recall Mean: ', 'Hate Recall Variance: ', hate_recalls),
        ('Hate F-Measure Mean: ', 'Hate F-Measure Variance: ', hate_f1s),
        ('Plate Precision Mean: ', 'Plate Precision Variance: ',
         plate_precisions),
        ('Plate Recall Mean:', 'Plate Recall Variance: ', plate_recalls),
        ('Plate F-Measure Mean: ', 'Plate F-Measure Variance: ', plate_f1s),
    ]
    for mean_label, var_label, scores in summary:
        # Average over the folds actually evaluated rather than assuming
        # their number equals cross_fold.
        print("%s %s" % (mean_label, sum(scores) / float(len(scores))))
        print("%s %s" % (var_label, float(array(scores).var())))
@mcenley: I've created a cleaned-up version of your script [1]. Again, SVMs are not scale invariant - scaling or normalizing is crucial for good results. In your original script you use Normalizer which does length normalization (L1 norm of each row equals 1). Personally, I only use it when I work with word frequencies - but not when I have heterogeneous features which are measured on different scales (your setting?). For the latter I prefer centering (zero mean, unit variance) or normalization to the range [-1,1] or [0,1].
Given that libsvm and liblinear optimize slightly different objective functions using different optimization techniques, their results are not too far off (see below). I do find it strange, however, that SVC has different results for dense and sparse data... we'll have to investigate further.
best,
Peter
LinearSVC
Accuracy: 0.6864
F-Score: 0.7196
SVC
Accuracy: 0.6568
F-Score: 0.7071
I have shared the FS2.dat file here : https://docs.google.com/leaf?id=0B0GLJLxdKPLqOGMyOTU3Y2UtNzAxZS00NjBiLTk5MTMtMDdjMjAwMDIyNDZj&hl=en_US
and the FS1.dat file here: https://docs.google.com/leaf?id=0B0GLJLxdKPLqOGM5M2ZkZmYtOGMxZi00MDcxLThhYjctOGE2MDBhZGY1YThl&hl=en_US