Created
December 6, 2014 00:28
-
-
Save richard-to/2f0b413fd7d13d7f0536 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
from sklearn import cross_validation | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.feature_selection import SelectPercentile, f_classif, chi2 | |
from sklearn.naive_bayes import MultinomialNB, BernoulliNB | |
from sklearn.metrics import accuracy_score, classification_report | |
from sklearn.svm import SVC | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
from sklearn.metrics import confusion_matrix | |
# Mapping from label name (as it appears in data.csv) to integer class id.
# NOTE(review): left empty here -- load_data() will raise KeyError unless
# this is populated with the dataset's label names before running.
LABELS_MAP = {}
# Human-readable class names indexed by class id; used for the
# classification report and the misclassification debug output.
# NOTE(review): also empty -- must be kept in step with LABELS_MAP.
LABELS_LIST = []
def load_data(path='data.csv', labels_map=None):
    """
    Load labeled examples from a CSV file.

    The file must have a header row, followed by rows whose first column
    is a text description (the feature) and whose second column is a
    label name to be looked up in labels_map.

    Args:
        path: Path of the CSV file to read. Defaults to 'data.csv',
            preserving the original hard-coded behavior.
        labels_map: Mapping from label name to integer class id.
            Defaults to the module-level LABELS_MAP.

    Returns:
        A (features, labels) tuple of parallel lists: description
        strings and their corresponding integer class ids.

    Raises:
        KeyError: If a label in the file is missing from labels_map.
    """
    if labels_map is None:
        labels_map = LABELS_MAP
    features = []
    labels = []
    with open(path, 'r') as infile:
        reader = csv.reader(infile)
        next(reader)  # skip the header row
        for row in reader:
            features.append(row[0])
            labels.append(labels_map[row[1]])
    return features, labels
def most_common(lst):
    """
    Return the element that occurs most often in lst.

    Ties are broken arbitrarily (set iteration order decides which
    equally-frequent candidate wins).
    """
    candidates = set(lst)
    return max(candidates, key=lambda value: lst.count(value))
def main():
    """
    Train three text classifiers, combine their predictions by majority
    vote, and print accuracy metrics on a held-out test set.

    Fix: the original used Python-2-only `print` statements and `xrange`,
    which fail on Python 3; converted to print() calls and zip iteration.
    """
    # Load data from csv file
    features, labels = load_data()

    # Split 90% of data into training data and 10% into test data.
    # NOTE(review): sklearn.cross_validation is deprecated; on modern
    # scikit-learn train_test_split lives in sklearn.model_selection.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        features, labels, test_size=0.1)

    # Logistic regression over tf-idf features.
    clf_logistic_regression = Pipeline([
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=.95, min_df=.001, stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', LogisticRegression()),
    ])
    clf_logistic_regression.fit(features_train, labels_train)
    pred_logistic_regression = clf_logistic_regression.predict(features_test)

    # Multinomial naive Bayes over raw term counts (1- to 4-grams).
    clf_nb = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.3, ngram_range=(1, 4))),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', MultinomialNB()),
    ])
    clf_nb.fit(features_train, labels_train)
    pred_nb = clf_nb.predict(features_test)

    # RBF-kernel SVM over tf-idf features.
    clf_svc = Pipeline([
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=5.0, stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=85)),
        ('clf', SVC(kernel='rbf', C=6000, gamma=.0002)),
    ])
    clf_svc.fit(features_train, labels_train)
    pred_svc = clf_svc.predict(features_test)

    # Majority vote across the three classifiers for each test example.
    pred = [
        most_common([lr, svc, nb])
        for lr, svc, nb in zip(pred_logistic_regression, pred_svc, pred_nb)
    ]

    # For debugging, print out which examples the ensemble got wrong.
    print("Incorrect Classifications:")
    print("--------------------------")
    for predicted, actual, description in zip(pred, labels_test, features_test):
        if predicted != actual:
            print(LABELS_LIST[predicted], LABELS_LIST[actual])
            print(description)

    # Print accuracy, per-class metrics, and confusion matrix for the ensemble.
    print(accuracy_score(labels_test, pred))
    print(classification_report(labels_test, pred, target_names=LABELS_LIST))
    print(confusion_matrix(labels_test, pred))
# Script entry point: train the classifiers and report metrics when
# this file is executed directly (not when imported as a module).
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment