Created
December 6, 2014 00:28
-
-
Save richard-to/2f0b413fd7d13d7f0536 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
from sklearn import cross_validation | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.feature_selection import SelectPercentile, f_classif, chi2 | |
from sklearn.naive_bayes import MultinomialNB, BernoulliNB | |
from sklearn.metrics import accuracy_score, classification_report | |
from sklearn.svm import SVC | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.pipeline import Pipeline | |
from sklearn.metrics import confusion_matrix | |
# Mapping from label name (as it appears in data.csv) to integer class id.
# NOTE(review): left empty here -- load_data() will raise KeyError unless
# this is populated with the dataset's label names before running.
LABELS_MAP = {}
# Human-readable class names indexed by class id; used for the
# classification report and the misclassification debug output.
# NOTE(review): also empty -- must be kept in step with LABELS_MAP.
LABELS_LIST = []
def load_data(path='data.csv', labels_map=None):
    """
    Load labeled examples from a CSV file.

    The file must have a header row, followed by rows whose first column
    is a text description (the feature) and whose second column is a
    label name to be looked up in labels_map.

    Args:
        path: Path of the CSV file to read. Defaults to 'data.csv',
            preserving the original hard-coded behavior.
        labels_map: Mapping from label name to integer class id.
            Defaults to the module-level LABELS_MAP.

    Returns:
        A (features, labels) tuple of parallel lists: description
        strings and their corresponding integer class ids.

    Raises:
        KeyError: If a label in the file is missing from labels_map.
    """
    if labels_map is None:
        labels_map = LABELS_MAP
    features = []
    labels = []
    with open(path, 'r') as infile:
        reader = csv.reader(infile)
        next(reader)  # skip the header row
        for row in reader:
            features.append(row[0])
            labels.append(labels_map[row[1]])
    return features, labels
def most_common(lst):
    """
    Return the element that occurs most often in lst.

    Ties are broken arbitrarily (set iteration order decides which
    equally-frequent candidate wins).
    """
    candidates = set(lst)
    return max(candidates, key=lambda value: lst.count(value))
def main():
    """
    Train three text classifiers, combine their predictions by majority
    vote, and print accuracy metrics on a held-out test set.

    Fix: the original used Python-2-only `print` statements and `xrange`,
    which fail on Python 3; converted to print() calls and zip iteration.
    """
    # Load data from csv file
    features, labels = load_data()

    # Split 90% of data into training data and 10% into test data.
    # NOTE(review): sklearn.cross_validation is deprecated; on modern
    # scikit-learn train_test_split lives in sklearn.model_selection.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        features, labels, test_size=0.1)

    # Logistic regression over tf-idf features.
    clf_logistic_regression = Pipeline([
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=.95, min_df=.001, stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', LogisticRegression()),
    ])
    clf_logistic_regression.fit(features_train, labels_train)
    pred_logistic_regression = clf_logistic_regression.predict(features_test)

    # Multinomial naive Bayes over raw term counts (1- to 4-grams).
    clf_nb = Pipeline([
        ('vect', CountVectorizer(stop_words='english', max_df=0.3, ngram_range=(1, 4))),
        ('selector', SelectPercentile(f_classif, percentile=100)),
        ('clf', MultinomialNB()),
    ])
    clf_nb.fit(features_train, labels_train)
    pred_nb = clf_nb.predict(features_test)

    # RBF-kernel SVM over tf-idf features.
    clf_svc = Pipeline([
        ('vect', TfidfVectorizer(sublinear_tf=True, max_df=5.0, stop_words='english')),
        ('selector', SelectPercentile(f_classif, percentile=85)),
        ('clf', SVC(kernel='rbf', C=6000, gamma=.0002)),
    ])
    clf_svc.fit(features_train, labels_train)
    pred_svc = clf_svc.predict(features_test)

    # Majority vote across the three classifiers for each test example.
    pred = [
        most_common([lr, svc, nb])
        for lr, svc, nb in zip(pred_logistic_regression, pred_svc, pred_nb)
    ]

    # For debugging, print out which examples the ensemble got wrong.
    print("Incorrect Classifications:")
    print("--------------------------")
    for predicted, actual, description in zip(pred, labels_test, features_test):
        if predicted != actual:
            print(LABELS_LIST[predicted], LABELS_LIST[actual])
            print(description)

    # Print accuracy, per-class metrics, and confusion matrix for the ensemble.
    print(accuracy_score(labels_test, pred))
    print(classification_report(labels_test, pred, target_names=LABELS_LIST))
    print(confusion_matrix(labels_test, pred))
# Script entry point: train the classifiers and report metrics when
# this file is executed directly (not when imported as a module).
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment