Skip to content

Instantly share code, notes, and snippets.

@he7d3r
Created December 24, 2014 20:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save he7d3r/6c1786be8482fdd2e554 to your computer and use it in GitHub Desktop.
Save he7d3r/6c1786be8482fdd2e554 to your computer and use it in GitHub Desktop.
Test GridSearchCV using a dataset obtained from a tsv file
"""
Test GridSearchCV using a dataset obtained from a tsv file
"""
import csv
from sklearn import svm
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
#from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
badwords_added, bytes_changed, chars_added,
day_of_week_in_utc, hour_of_day_in_utc,
is_custom_comment, is_mainspace,
is_previous_user_same, is_section_comment,
longest_repeated_char_added,
longest_token_added, misspellings_added,
numeric_chars_added, page_age_in_seconds,
prev_badwords, prev_misspellings, prev_words,
proportion_of_badwords_added,
proportion_of_markup_added,
proportion_of_misspellings_added,
proportion_of_numeric_added,
proportion_of_prev_badwords,
proportion_of_prev_misspellings,
proportion_of_symbolic_added,
proportion_of_uppercase_added,
seconds_since_last_page_edit,
seconds_since_last_user_edit, segments_added,
segments_removed, symbolic_chars_added,
uppercase_chars_added, user_age_in_seconds,
user_is_anon, user_is_bot, words_added,
words_removed)
# Every feature imported from revscores.features, listed in the same order as
# the import statement above.  In the code visible here the list is only
# referenced by the disabled LinearSVC model line that follows it.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]
#linear_svc_model = LinearSVC.MODEL(features, C=1.0, kernel='rbf')
# Load the labelled dataset from the TSV file.  Each data row becomes a
# (values, reverted) pair: `values` holds every column but the last converted
# to float, and `reverted` is the last column converted to int.

# Textual booleans found in the file and the floats they are stored as.
_BOOLEAN_CELLS = {'True': 1., 'False': 0.}

data = []
with open('revscores.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    # The first row is read into `headers` and kept out of the data.
    headers = next(reader)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # skip them instead of failing on row[-1].
            continue
        values = []
        for cell in row[:-1]:
            if cell in _BOOLEAN_CELLS:
                values.append(_BOOLEAN_CELLS[cell])
            else:
                # Anything that is not a boolean literal must parse as a
                # number; float() raises ValueError otherwise.
                values.append(float(cell))
        reverted = int(row[-1])
        data.append((values, reverted))
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
training_set, test_set = train_test_split(
    data,
    test_size=0.4,
    random_state=0,
)

# Plain SVC with default hyper-parameters.  It is only referenced by the
# disabled experiments kept in the comments of this section.
estimator = svm.SVC()
#estimator.fit(*zip(*training_set))
#linear_svc_model.train(training_set)

# Expected labels of the held-out rows, in test_set order.
y_true = [label for _, label in test_set]

# Disabled: evaluation of the un-tuned models.
#y_pred = [ linear_svc_model.svc.predict(x)[0] for x, y in test_set ]
#y_pred = [ estimator.predict(x)[0] for x, y in test_set ]
#print('== Classification Report ==')
#print(metrics.classification_report(y_true, y_pred))
#print(estimator.get_params())
# Based on http://scikit-learn.org/stable/auto_examples/grid_search_digits.html#parameter-estimation-using-grid-search-with-cross-validation

# Hyper-parameter grid explored for the RBF kernel.  A linear-kernel grid is
# left disabled for reference:
#     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-6, 1e-9, 1e-13, 1e-16],
        'C': [1, 10, 100, 1000],
    },
]

# Scoring functions to optimise; one grid search is run per entry.
scores = ['f1']  # , 'roc_auc']

# Separate features from labels once, so that fitting and prediction operate
# on whole batches instead of one sample at a time.
X_train, y_train = zip(*training_set)
X_test = [x for x, _ in test_set]

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validated search over tuned_parameters.
    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # `cv_scores` is deliberately not called `scores`: that name is the list
    # the enclosing loop iterates over and must not be rebound here.
    for params, mean_score, cv_scores in clf.grid_scores_:
        # Reports the mean score and half the standard deviation of the
        # per-fold scores for each parameter combination.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict the whole evaluation set in a single call; y_true was built
    # from test_set in the same order, so the labels line up.
    y_pred = clf.predict(X_test)
    print(metrics.classification_report(y_true, y_pred))
    print()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment