Skip to content

Instantly share code, notes, and snippets.

@pedrokoblitz
Forked from oyvindse/train_test_svm.py
Last active August 29, 2015 13:56
Show Gist options
  • Save pedrokoblitz/8984223 to your computer and use it in GitHub Desktop.
Save pedrokoblitz/8984223 to your computer and use it in GitHub Desktop.
from sklearn.datasets import load_svmlight_file
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm.sparse import LinearSVC
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
import numpy as np
# Evaluate a linear SVM on the svmlight-format file "fr.vec" using
# stratified 10-fold cross-validation, printing per-fold diagnostics and
# the mean accuracy over all folds.
X, y = load_svmlight_file("fr.vec")

# Recode the -1 labels as 0; all other label values are left untouched.
y[y == -1] = 0

# Legacy sklearn.cross_validation signature: the splitter receives the label
# vector directly and is itself iterable over (train, test) index pairs.
kf = StratifiedKFold(y, k=10, indices=True)

# NOTE: MultinomialNB() (imported at the top of the file) can be substituted
# here to evaluate a naive Bayes classifier with the same procedure.
clf = LinearSVC()

# One accuracy value per fold.
mean_li = []
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The same estimator object is refit from the training part of each fold.
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)

    # Single-argument print(...) produces the same output on Python 2 and
    # also parses on Python 3.
    print(metrics.confusion_matrix(y_test, y_predicted))
    print(metrics.classification_report(y_test, y_predicted))

    # Fold accuracy: fraction of test samples whose prediction matches.
    mean_li.append(np.mean(y_predicted == y_test))

# Average accuracy across the folds.
print(np.mean(mean_li))
#!/usr/bin/python
import sys
from numpy import loadtxt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm.sparse import LinearSVC
# Train a bag-of-words -> TF-IDF -> linear SVM pipeline on the tab-separated
# file named by the first command-line argument, then report accuracy on the
# tab-separated file named by the second argument.

# Column positions used below: column 4 is fed to the vectorizer as the
# document text, column 3 is used as the target label.
LABEL_COL, TEXT_COL = 3, 4

# Every field is loaded as a byte string (dtype='S'), one row per line.
my_data = loadtxt(sys.argv[1], dtype='S', delimiter='\t')
my_test_data = loadtxt(sys.argv[2], dtype='S', delimiter='\t')

# Token counts -> TF-IDF weighting -> linear SVM, chained as one estimator.
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps)

print("Training...")
train_text = my_data[:, TEXT_COL]
train_labels = my_data[:, LABEL_COL]
my_clf = text_clf.fit(train_text, train_labels)

print("Done! \nClassifying test set...")
predicted = my_clf.predict(my_test_data[:, TEXT_COL])

# Accuracy: fraction of test rows whose predicted label equals column 3.
is_correct = predicted == my_test_data[:, LABEL_COL]
print(np.mean(is_correct))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment