Skip to content

Instantly share code, notes, and snippets.

@fannix
Created December 16, 2011 03:18
Show Gist options
  • Save fannix/1484277 to your computer and use it in GitHub Desktop.
Save fannix/1484277 to your computer and use it in GitHub Desktop.
Semi-supervised Naive Bayes
from sklearn.naive_bayes import EMNB, MultinomialNB, BernoulliNB
from sklearn.cross_validation import KFold
from sklearn.datasets import load_svmlight_file
from scipy.sparse import vstack
import numpy as np
# Read the feature matrix and label vector from an svmlight-format file and
# store the labels as 32-bit integers.
X, y = load_svmlight_file("mpqa_en.vec")
y = np.asarray(y, dtype=np.int32)

# The leading 80% of the rows form the labeled pool; the remaining rows are
# treated as unlabeled data.
n_labeled = int(0.8 * X.shape[0])
X_labeled, X_unlabeled = X[:n_labeled], X[n_labeled:]
y_labeled, y_unlabeled = y[:n_labeled], y[n_labeled:]

# Both label slices are views into ``y``, so the writes below modify ``y``
# in place.  Labeled rows: class -1 is remapped to 0.  Unlabeled rows: every
# label is overwritten with -1 (presumably the "unlabeled" marker that EMNB
# expects -- TODO confirm against the EMNB implementation).
y_labeled[y_labeled == -1] = 0
y_unlabeled.fill(-1)

# 10-fold splitter over the labeled pool and the two models being compared:
# a plain multinomial Naive Bayes and the EM wrapper around one.
kf = KFold(n_labeled, k=10, indices=True)
clf1 = MultinomialNB()
clf = EMNB(MultinomialNB(alpha=1), verbose=False, n_iter=100)
# Supervised baseline: cross-validate ``clf1`` on the labeled pool only and
# print the mean accuracy over all folds.
li_super = []  # per-fold accuracy
for train_index, test_index in kf:
    X_train, X_test = X_labeled[train_index], X_labeled[test_index]
    y_train, y_test = y_labeled[train_index], y_labeled[test_index]
    clf1.fit(X_train, y_train.flatten())
    y_predicted = clf1.predict(X_test)
    # Fraction of test samples predicted correctly; the mean of the boolean
    # match array equals (number of hits) / (number of samples).
    li_super.append(np.mean(y_predicted == y_test))
# Parenthesised single-argument form works under both Python 2 and 3.
print(np.mean(li_super))
# Semi-supervised run: same 10-fold protocol over the labeled pool, but each
# training fold is augmented with all unlabeled rows before fitting ``clf``.
# Accuracy is still measured on the held-out labeled fold only.
kf = KFold(n_labeled, k=10, indices=True)
li_semi = []  # per-fold accuracy
for train_index, test_index in kf:
    X_train, X_test = X_labeled[train_index], X_labeled[test_index]
    y_train, y_test = y_labeled[train_index], y_labeled[test_index]
    # Stack the labeled training rows on top of the unlabeled rows; the label
    # vector is joined in the same order so rows and labels stay aligned.
    X_ = vstack((X_train, X_unlabeled), format="csr")
    y_ = np.concatenate((y_train, y_unlabeled))
    clf.fit(X_, y_)
    y_predicted = clf.predict(X_test)
    # Fraction of test samples predicted correctly.
    li_semi.append(np.mean(y_predicted == y_test))
# Parenthesised single-argument form works under both Python 2 and 3.
print(np.mean(li_semi))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment