Skip to content

Instantly share code, notes, and snippets.

@vanatteveldt
Last active June 17, 2018 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vanatteveldt/3bf403c8f3c1f2195f8eb7ca22f33b6c to your computer and use it in GitHub Desktop.
Save vanatteveldt/3bf403c8f3c1f2195f8eb7ca22f33b6c to your computer and use it in GitHub Desktop.
import csv, random
# Load the labelled examples: one (text, label) pair per CSV row, taken from
# the 'text' and 'label' columns.
# The file is opened in a `with` block so the handle is closed as soon as the
# rows have been read, and with newline='' as the csv module requires so that
# line breaks embedded in quoted fields are parsed correctly.
with open('issues2.csv', newline='') as csv_file:
    data = [(row['text'], row['label']) for row in csv.DictReader(csv_file)]

# Shuffle in place so the split below is random. No seed is set in this
# script, so the split (and the accuracies computed from it) can differ
# from run to run.
random.shuffle(data)

# The first TRAIN_SIZE shuffled rows are used for training; everything after
# them is held out for testing.
TRAIN_SIZE = 1000
train_data = data[:TRAIN_SIZE]
test_data = data[TRAIN_SIZE:]

# Separate each split into parallel lists of texts and labels.
train_texts = [text for (text, label) in train_data]
train_labels = [label for (text, label) in train_data]
test_texts = [text for (text, label) in test_data]
test_labels = [label for (text, label) in test_data]
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
# Text classifier: token counts -> TF-IDF weighting -> linear model trained
# with SGD on hinge loss (the step named 'clf-svm').
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=10,
        random_state=42,
    )),
])

# fit() returns the pipeline itself, so the name does not need rebinding.
text_clf.fit(train_texts, train_labels)

# Report accuracy (the fraction of predictions equal to the given label),
# first on the data the model was trained on and then on the held-out data.
for heading, texts, labels in (
    ("Accuracy on training set:", train_texts, train_labels),
    ("Accuracy on test set:", test_texts, test_labels),
):
    predicted = text_clf.predict(texts)
    print(heading, np.mean(predicted == labels))
$ env/bin/python test_sklearn.py
Accuracy on training set: 0.684
Accuracy on test set: 0.4605809128630705
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment