Skip to content

Instantly share code, notes, and snippets.

Last active Aug 29, 2015
What would you like to do?
IMDB review Sentiment Analysis based on Support Vector Machine
Sentiment Analysis using sklearn
* sklearn LinearSVC
* 10-fold cross validation
* accuracy 88.45%
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.cross_validation import KFold
# Load the IMDB review corpus from the 'imdb1' directory.
imdb_review = load_files('imdb1')
# NOTE(review): the original np.array(...) arguments were truncated; they are
# reconstructed from the `data` / `target` fields of the Bunch returned by
# load_files() -- confirm against the original script.
X = np.array(imdb_review.data)
y = np.array(imdb_review.target)

# 10-fold cross validation over every loaded document; the sample count is
# taken from the data instead of being hard-coded to 2000.
kf = KFold(len(X), n_folds=10)
accuracy = []
fold = 0
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # The pipeline fits the tf-idf vocabulary on the training fold only and
    # re-uses it when scoring the test fold, so no separate vectorizer
    # (and no re-fitting on test data) is needed.
    text_clf = Pipeline([("tfidf", TfidfVectorizer(sublinear_tf=True)),
                         ("svc", LinearSVC())])
    text_clf.fit(X_train, y_train)

    a = text_clf.score(X_test, y_test)
    # Record the per-fold score so the average below is meaningful.
    accuracy.append(a)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('[INFO]\tFold %d Accuracy: %f' % (fold, a))
    fold += 1

# `fold` now equals the number of folds that were evaluated.
avgAccuracy = sum(accuracy) / fold
print('[INFO]\tAccuracy: %f' % avgAccuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment