Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Learning classifiers from positive and unlabeled data using the sample-weighting method proposed by Elkan and Noto (2008).
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
class PUClassifier(object):
    """Positive-unlabeled (PU) learning via sample weighting (Elkan & Noto, 2008).

    The wrapped "traditional" classifier is trained to separate labeled
    examples (``s == 1``) from unlabeled ones (``s == 0``).  Its predicted
    probabilities are then turned into per-sample weights so that another
    classifier can be trained on a weighted positive/negative data set.

    Parameters
    ----------
    trad_clf : estimator or None
        Object exposing ``fit(X, s)`` and ``predict_proba(X)``.  When ``None``,
        ``fit`` builds a ``GridSearchCV`` over ``SGDClassifier(loss="log")``.
    n_folds : int
        Number of stratified folds used by ``fit`` to estimate ``c``.

    Attributes
    ----------
    c : float
        Set by ``fit``: the mean predicted probability of class 1 over
        held-out labeled samples, averaged across folds.
    """

    def __init__(self, trad_clf=None, n_folds=2):
        self.trad_clf = trad_clf
        self.n_folds = n_folds

    def fit(self, X, s):
        """Estimate ``self.c`` by cross-validation and return ``self``.

        Parameters
        ----------
        X : ndarray
            Samples, indexed along the first axis.
        s : ndarray
            Label indicator per sample: 1 for labeled, 0 for unlabeled.
        """
        if self.trad_clf is None:
            self.trad_clf = GridSearchCV(
                SGDClassifier(loss="log", penalty="l2"),
                param_grid={"alpha": np.logspace(-4, 0, 10)},
            )

        fold_estimates = np.zeros(self.n_folds)
        folds = StratifiedKFold(s, n_folds=self.n_folds, shuffle=True)
        for i, (train_idx, test_idx) in enumerate(folds):
            self.trad_clf.fit(X[train_idx], s[train_idx])
            # Only the held-out *labeled* samples contribute to the estimate.
            held_out_labeled = X[test_idx][s[test_idx] == 1]
            proba = self.trad_clf.predict_proba(held_out_labeled)[:, 1]
            fold_estimates[i] = proba.mean()

        self.c = fold_estimates.mean()
        return self

    def sample(self, X, s):
        """Build a weighted training set from positive and unlabeled data.

        Calls ``fit(X, s)`` first when ``self.c`` has not been set, then
        refits ``self.trad_clf`` on all of ``(X, s)``.

        Returns
        -------
        X_train : ndarray
            Labeled samples followed by the unlabeled samples twice.
        y_train : ndarray
            1 for the labeled samples and the first unlabeled copy, 0 for the
            second unlabeled copy.
        sample_weight : ndarray
            1.0 for labeled samples; ``w`` for the first unlabeled copy and
            ``1 - w`` for the second, with ``w`` clipped to ``[0, 1]``.
        """
        if not hasattr(self, "c"):
            self.fit(X, s)

        X_positive = X[s == 1]
        X_unlabeled = X[s == 0]
        n_positive = X_positive.shape[0]
        n_unlabeled = X_unlabeled.shape[0]

        X_train = np.concatenate([X_positive, X_unlabeled, X_unlabeled], axis=0)
        y_train = np.concatenate([
            np.repeat(1, n_positive),
            np.repeat(1, n_unlabeled),
            np.repeat(0, n_unlabeled),
        ])

        self.trad_clf.fit(X, s)
        p_unlabeled = self.trad_clf.predict_proba(X_unlabeled)[:, 1]

        # p == 1 makes the odds infinite; that case is handled by the clip
        # below, so the divide warning is suppressed here.
        with np.errstate(divide="ignore"):
            odds = p_unlabeled / (1 - p_unlabeled)
            w_positive = ((1 - self.c) / self.c) * odds
        # The weight is used as a probability, but estimation error can push
        # it above 1, which would make the complementary weight negative.
        w_positive = np.clip(w_positive, 0.0, 1.0)
        w_negative = 1 - w_positive

        sample_weight = np.concatenate([
            np.repeat(1.0, n_positive),
            w_positive,
            w_negative,
        ])
        return X_train, y_train, sample_weight
if __name__ == '__main__':
    # Demo: draw two Gaussian blobs, hide the labels of part of the positive
    # class, and compare the classifier trained on the weighted PU sample
    # ("non-traditional") with the labeled-vs-unlabeled classifier
    # ("traditional") on the unlabeled points.
    import itertools

    import matplotlib.pyplot as plt
    import seaborn as sns  # kept from the original script; not referenced by name below
    from sklearn import metrics

    np.random.seed(0)

    # --- synthetic data -----------------------------------------------------
    n_positive = 100
    n_negative = 500
    mu1 = [0, 0]
    mu2 = [2, 2]
    Sigma1 = 0.1 * np.identity(2)
    Sigma2 = 0.5 * np.identity(2)
    X = np.r_[np.random.multivariate_normal(mu1, Sigma1, n_positive),
              np.random.multivariate_normal(mu2, Sigma2, n_negative)]
    y = np.concatenate([np.repeat(1, n_positive), np.repeat(0, n_negative)])

    # The positives come first in X, so this marks the first 70 of them as
    # unlabeled; all negatives are already 0 in the copy of y.
    n_unlabeled = int(n_positive * 0.7)
    s = y.copy()
    s[:n_unlabeled] = 0

    # --- weighted PU sample -------------------------------------------------
    pu = PUClassifier(n_folds=5)
    X_train, y_train, sample_weight = pu.sample(X, s)

    # --- choose alpha by weighted cross-validated accuracy ------------------
    alphas = np.logspace(-4, 0, 10)
    class_weights = [{1: 1}]
    n_folds = 3
    best_score = -np.inf
    best_alpha = None
    best_class_weight = None
    for alpha, class_weight in itertools.product(alphas, class_weights):
        scores = np.zeros(n_folds)
        folds = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)
        for i, (itr, ite) in enumerate(folds):
            clf = SGDClassifier(
                loss="hinge", penalty="l2", alpha=alpha,
                class_weight=class_weight,
            ).fit(X_train[itr], y_train[itr], sample_weight=sample_weight[itr])
            ypred = clf.predict(X_train[ite])
            scores[i] = metrics.accuracy_score(
                y_train[ite], ypred, sample_weight=sample_weight[ite])
        this_score = scores.mean()
        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("{} {} {}".format(alpha, class_weight, this_score))
        if this_score > best_score:
            best_score = this_score
            best_alpha = alpha
            best_class_weight = class_weight
    print("{} {} {}".format(best_alpha, best_class_weight, best_score))

    # --- final model and evaluation on the unlabeled points -----------------
    clf = SGDClassifier(
        loss="hinge", penalty="l2", alpha=best_alpha,
        class_weight=best_class_weight,
    ).fit(X_train, y_train, sample_weight=sample_weight)
    unlabeled = s == 0
    ypred = clf.predict(X[unlabeled])
    #ypred = pu.trad_clf.predict_proba(X[s==0])[:,1]>=0.5*pu.c # <- this can also be used.
    trad_ypred = pu.trad_clf.predict(X[unlabeled])
    accuracy = metrics.accuracy_score(y[unlabeled], ypred)
    trad_accuracy = metrics.accuracy_score(y[unlabeled], trad_ypred)
    print("accuracy (traditional): {}".format(trad_accuracy))
    print("accuracy (non-traditional): {}".format(accuracy))

    # --- plot ---------------------------------------------------------------
    ypred = clf.predict(X)
    trad_ypred = pu.trad_clf.predict(X)
    offset = 1.0
    XX, YY = np.meshgrid(
        np.linspace(X[:, 0].min() - offset, X[:, 0].max() + offset, 100),
        np.linspace(X[:, 1].min() - offset, X[:, 1].max() + offset, 100))
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)

    plt.figure(figsize=(10, 10))

    # Colour lists are indexed with plain ints rather than NumPy scalars.
    plt.subplot(2, 2, 1)
    colors = ["r", "b"]
    plot_colors = [colors[int(flag)] for flag in y == 1]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("true")

    plt.subplot(2, 2, 2)
    colors = ["gray", "b"]
    plot_colors = [colors[int(label)] for label in s]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("positive and unlabeled data")

    plt.subplot(2, 2, 3)
    colors = ["r", "b"]
    plot_colors = [colors[int(label)] for label in trad_ypred]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("traditional (accuracy={})".format(trad_accuracy))

    plt.subplot(2, 2, 4)
    colors = ["r", "b"]
    plot_colors = [colors[int(label)] for label in ypred]
    # Green contour at decision_function == 0 for the final classifier.
    plt.contour(XX, YY, Z, levels=[0.0], colors="green")
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("non-traditional (accuracy={})".format(accuracy))

    plt.tight_layout()
    plt.show()
    #plt.savefig("pusampler_demo.png")
Owner
nkt1546789 commented Jul 21, 2016 edited

The resulting figure:
pusampler_demo

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment