# he7d3r/demonstrate_GridSearchCV.py

## demonstrate_GridSearchCV.py
"""
Test GridSearchCV using a dataset obtained from a tsv file
"""
import csv
from sklearn import svm
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
#from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
                                badwords_added, bytes_changed, chars_added,
                                day_of_week_in_utc, hour_of_day_in_utc,
                                is_custom_comment, is_mainspace,
                                is_previous_user_same, is_section_comment,
                                longest_repeated_char_added,
                                longest_token_added, misspellings_added,
                                numeric_chars_added, page_age_in_seconds,
                                prev_badwords, prev_misspellings, prev_words,
                                proportion_of_badwords_added,
                                proportion_of_markup_added,
                                proportion_of_misspellings_added,
                                proportion_of_numeric_added,
                                proportion_of_prev_badwords,
                                proportion_of_prev_misspellings,
                                proportion_of_symbolic_added,
                                proportion_of_uppercase_added,
                                seconds_since_last_page_edit,
                                seconds_since_last_user_edit, segments_added,
                                segments_removed, symbolic_chars_added,
                                uppercase_chars_added, user_age_in_seconds,
                                user_is_anon, user_is_bot, words_added,
                                words_removed)

# Feature objects imported from revscores.features, listed in the same order
# as the import statement. In the visible code this list is referenced only
# by the commented-out LinearSVC model.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]
#linear_svc_model = LinearSVC.MODEL(features, C=1.0, kernel='rbf')
def _cell_to_float(cell):
    """Convert one TSV cell to a float.

    The literal strings 'True' and 'False' map to 1.0 and 0.0; any other
    cell is handed to float(), which raises ValueError for non-numeric text.
    """
    if cell == 'True':
        return 1.
    if cell == 'False':
        return 0.
    return float(cell)


# Each entry of `data` is a (values, reverted) pair built from one TSV row:
# every column except the last is converted to a float, and the last column
# is parsed as the integer label.
data = []
with open('revscores.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    headers = next(reader)  # the first row is consumed as the header
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; indexing
            # row[-1] on it would raise IndexError.
            continue
        values = [_cell_to_float(cell) for cell in row[:-1]]
        reverted = int(row[-1])
        data.append((values, reverted))

# Hold out 40% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible.
training_set, test_set = train_test_split(data, test_size=0.4, random_state=0)

# Split the (values, reverted) pairs into features and labels once, so the
# estimator is always given a whole 2-D batch instead of single rows.
X_train, y_train = zip(*training_set)
X_test = [x for x, y in test_set]
y_true = [y for x, y in test_set]

# Untuned baseline; only the commented-out experiment below uses it.
estimator = svm.SVC()
#estimator.fit(X_train, y_train)
#linear_svc_model.train(training_set)
#y_pred = [ linear_svc_model.svc.predict(x)[0] for x, y in test_set ]
#y_pred = list(estimator.predict(X_test))
#print('== Classification Report ==')
#print(metrics.classification_report(y_true, y_pred))
#print(estimator.get_params())

# Based on http://scikit-learn.org/stable/auto_examples/grid_search_digits.html#parameter-estimation-using-grid-search-with-cross-validation
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-6, 1e-9, 1e-13, 1e-16],
        'C': [1, 10, 100, 1000],
    },
    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

scores = ['f1']#, 'roc_auc']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # NOTE(review): grid_scores_ comes from the GridSearchCV imported from
    # sklearn.grid_search; newer scikit-learn releases expose cv_results_
    # instead -- confirm against the installed version.
    # The per-fold array is named fold_scores so it does not rebind the
    # `scores` list that the enclosing loop iterates over.
    for params, mean_score, fold_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict the whole held-out set in one call: predict() expects a 2-D
    # (n_samples, n_features) input, not one flat sample at a time.
    y_pred = list(clf.predict(X_test))
    print(metrics.classification_report(y_true, y_pred))
    print()
	"""
	Test GridSearchCV using a dataset obtained from a tsv file
	"""
	import csv
	from sklearn import svm
	from sklearn import metrics
	from sklearn.cross_validation import train_test_split
	from sklearn.grid_search import GridSearchCV
	#from revscores.scorers import LinearSVC
	from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
	badwords_added, bytes_changed, chars_added,
	day_of_week_in_utc, hour_of_day_in_utc,
	is_custom_comment, is_mainspace,
	is_previous_user_same, is_section_comment,
	longest_repeated_char_added,
	longest_token_added, misspellings_added,
	numeric_chars_added, page_age_in_seconds,
	prev_badwords, prev_misspellings, prev_words,
	proportion_of_badwords_added,
	proportion_of_markup_added,
	proportion_of_misspellings_added,
	proportion_of_numeric_added,
	proportion_of_prev_badwords,
	proportion_of_prev_misspellings,
	proportion_of_symbolic_added,
	proportion_of_uppercase_added,
	seconds_since_last_page_edit,
	seconds_since_last_user_edit, segments_added,
	segments_removed, symbolic_chars_added,
	uppercase_chars_added, user_age_in_seconds,
	user_is_anon, user_is_bot, words_added,
	words_removed)

	# Feature objects imported from revscores.features, listed in the same
	# order as the import statement. In the visible code this list is
	# referenced only by the commented-out LinearSVC model.
	features = [
		added_badwords_ratio,
		added_misspellings_ratio,
		badwords_added,
		bytes_changed,
		chars_added,
		day_of_week_in_utc,
		hour_of_day_in_utc,
		is_custom_comment,
		is_mainspace,
		is_previous_user_same,
		is_section_comment,
		longest_repeated_char_added,
		longest_token_added,
		misspellings_added,
		numeric_chars_added,
		page_age_in_seconds,
		prev_badwords,
		prev_misspellings,
		prev_words,
		proportion_of_badwords_added,
		proportion_of_markup_added,
		proportion_of_misspellings_added,
		proportion_of_numeric_added,
		proportion_of_prev_badwords,
		proportion_of_prev_misspellings,
		proportion_of_symbolic_added,
		proportion_of_uppercase_added,
		seconds_since_last_page_edit,
		seconds_since_last_user_edit,
		segments_added,
		segments_removed,
		symbolic_chars_added,
		uppercase_chars_added,
		user_age_in_seconds,
		user_is_anon,
		user_is_bot,
		words_added,
		words_removed,
	]
	#linear_svc_model = LinearSVC.MODEL(features, C=1.0, kernel='rbf')
	def _cell_to_float(cell):
		"""Convert one TSV cell to a float.

		The literal strings 'True' and 'False' map to 1.0 and 0.0; any other
		cell is handed to float(), which raises ValueError for non-numeric
		text.
		"""
		if cell == 'True':
			return 1.
		if cell == 'False':
			return 0.
		return float(cell)


	# Each entry of `data` is a (values, reverted) pair built from one TSV
	# row: every column except the last is converted to a float, and the
	# last column is parsed as the integer label.
	data = []
	with open('revscores.tsv', newline='') as tsvfile:
		reader = csv.reader(tsvfile, delimiter='\t')
		headers = next(reader)  # the first row is consumed as the header
		for row in reader:
			if not row:
				# csv.reader yields an empty list for a blank line; indexing
				# row[-1] on it would raise IndexError.
				continue
			values = [_cell_to_float(cell) for cell in row[:-1]]
			reverted = int(row[-1])
			data.append((values, reverted))

	# Hold out 40% of the rows for the final evaluation; the fixed
	# random_state makes the split reproducible.
	training_set, test_set = train_test_split(data, test_size=0.4, random_state=0)

	# Split the (values, reverted) pairs into features and labels once, so
	# the estimator is always given a whole 2-D batch instead of single rows.
	# (Passing zip(training_set) directly to fit() would hand it one iterator
	# as X and no y.)
	X_train, y_train = zip(*training_set)
	X_test = [x for x, y in test_set]
	y_true = [y for x, y in test_set]

	# Untuned baseline; only the commented-out experiment below uses it.
	estimator = svm.SVC()
	#estimator.fit(X_train, y_train)
	#linear_svc_model.train(training_set)
	#y_pred = [ linear_svc_model.svc.predict(x)[0] for x, y in test_set ]
	#y_pred = list(estimator.predict(X_test))
	#print('== Classification Report ==')
	#print(metrics.classification_report(y_true, y_pred))
	#print(estimator.get_params())

	# Based on http://scikit-learn.org/stable/auto_examples/grid_search_digits.html#parameter-estimation-using-grid-search-with-cross-validation
	tuned_parameters = [
		{
			'kernel': ['rbf'],
			'gamma': [1e-3, 1e-6, 1e-9, 1e-13, 1e-16],
			'C': [1, 10, 100, 1000],
		},
		#{'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
	]

	scores = ['f1']#, 'roc_auc']

	for score in scores:
		print("# Tuning hyper-parameters for %s" % score)
		print()

		clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
		clf.fit(X_train, y_train)

		print("Best parameters set found on development set:")
		print()
		print(clf.best_estimator_)
		print()
		print("Grid scores on development set:")
		print()
		# NOTE(review): grid_scores_ comes from the GridSearchCV imported
		# from sklearn.grid_search; newer scikit-learn releases expose
		# cv_results_ instead -- confirm against the installed version.
		# The per-fold array is named fold_scores so it does not rebind the
		# `scores` list that the enclosing loop iterates over.
		for params, mean_score, fold_scores in clf.grid_scores_:
			print("%0.3f (+/-%0.03f) for %r"
				% (mean_score, fold_scores.std() / 2, params))
		print()

		print("Detailed classification report:")
		print()
		print("The model is trained on the full development set.")
		print("The scores are computed on the full evaluation set.")
		print()
		# Predict the whole held-out set in one call: predict() expects a
		# 2-D (n_samples, n_features) input, not one flat sample at a time.
		y_pred = list(clf.predict(X_test))
		print(metrics.classification_report(y_true, y_pred))
		print()