@nkt1546789
Last active May 12, 2022 15:13
Learning classifiers from positive and unlabeled data by sample weighting, as proposed by Elkan and Noto (2008).
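
Briefly, in the paper's notation: assuming labeled examples are selected completely at random from the positives, the label frequency c = p(s=1|y=1) is a constant, so a "traditional" classifier trained to predict s gives

    p(y=1|x) = p(s=1|x) / c

and each unlabeled example is positive with probability

    w(x) = p(y=1|x, s=0) = ((1 - c) / c) * p(s=1|x) / (1 - p(s=1|x)).

The code below estimates c as the mean of p(s=1|x) over held-out labeled positives, then duplicates each unlabeled example into a positive copy with weight w(x) and a negative copy with weight 1 - w(x); labeled positives keep weight 1.
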
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV


class PUClassifier(object):
    def __init__(self, trad_clf=None, n_folds=2):
        self.trad_clf = trad_clf
        self.n_folds = n_folds

    def fit(self, X, s):
        # Estimate c = p(s=1|y=1) as the average probability the traditional
        # classifier assigns to held-out labeled positives.
        if self.trad_clf is None:
            self.trad_clf = GridSearchCV(SGDClassifier(loss="log", penalty="l2"),
                                         param_grid={"alpha": np.logspace(-4, 0, 10)})
        c = np.zeros(self.n_folds)
        for i, (itr, ite) in enumerate(StratifiedKFold(s, n_folds=self.n_folds, shuffle=True)):
            self.trad_clf.fit(X[itr], s[itr])
            c[i] = self.trad_clf.predict_proba(X[ite][s[ite] == 1])[:, 1].mean()
        self.c = c.mean()
        return self

    def sample(self, X, s):
        # Build the weighted training set: labeled positives get weight 1; each
        # unlabeled example appears twice, once as a positive with weight
        # w(x) = p(y=1|x, s=0) and once as a negative with weight 1 - w(x).
        if not hasattr(self, "c"):
            self.fit(X, s)
        X_positive = X[s == 1]
        X_unlabeled = X[s == 0]
        n_positive = X_positive.shape[0]
        n_unlabeled = X_unlabeled.shape[0]
        X_train = np.r_[X_positive, X_unlabeled, X_unlabeled]
        y_train = np.concatenate([np.repeat(1, n_positive), np.repeat(1, n_unlabeled), np.repeat(0, n_unlabeled)])
        self.trad_clf.fit(X, s)
        p_unlabeled = self.trad_clf.predict_proba(X_unlabeled)[:, 1]
        w_positive = ((1 - self.c) / self.c) * (p_unlabeled / (1 - p_unlabeled))
        w_negative = 1 - w_positive
        sample_weight = np.concatenate([np.repeat(1.0, n_positive), w_positive, w_negative])
        return X_train, y_train, sample_weight


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    import seaborn as sns
    import itertools
    from sklearn import metrics

    np.random.seed(0)

    # Two-class Gaussian toy data.
    n_positive = 100
    n_negative = 500
    n = n_positive + n_negative
    mu1 = [0, 0]
    mu2 = [2, 2]
    Sigma1 = 0.1 * np.identity(2)
    Sigma2 = 0.5 * np.identity(2)
    X = np.r_[np.random.multivariate_normal(mu1, Sigma1, n_positive),
              np.random.multivariate_normal(mu2, Sigma2, n_negative)]
    y = np.concatenate([np.repeat(1, n_positive), np.repeat(0, n_negative)])

    # Hide the labels of 70% of the positives to simulate a PU setting.
    n_unlabeled = int(n_positive * 0.7)
    s = y.copy()
    s[:n_unlabeled] = 0

    pu = PUClassifier(n_folds=5)
    X_train, y_train, sample_weight = pu.sample(X, s)

    # Grid search over alpha for the weighted (non-traditional) classifier.
    alphas = np.logspace(-4, 0, 10)
    class_weights = [{1: 1}]
    n_folds = 3
    best_score = -np.inf
    best_alpha = None
    best_class_weight = None
    for alpha, class_weight in itertools.product(alphas, class_weights):
        scores = np.zeros(n_folds)
        for i, (itr, ite) in enumerate(StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)):
            clf = SGDClassifier(loss="hinge", penalty="l2", alpha=alpha, class_weight=class_weight).fit(
                X_train[itr], y_train[itr], sample_weight=sample_weight[itr])
            ypred = clf.predict(X_train[ite])
            scores[i] = metrics.accuracy_score(y_train[ite], ypred, sample_weight=sample_weight[ite])
        this_score = scores.mean()
        print alpha, class_weight, this_score
        if this_score > best_score:
            best_score = this_score
            best_alpha = alpha
            best_class_weight = class_weight
    print best_alpha, best_class_weight, best_score

    # Retrain on the full weighted set and evaluate on the unlabeled examples.
    clf = SGDClassifier(loss="hinge", penalty="l2", alpha=best_alpha, class_weight=best_class_weight).fit(
        X_train, y_train, sample_weight=sample_weight)
    ypred = clf.predict(X[s == 0])
    # ypred = pu.trad_clf.predict_proba(X[s==0])[:,1] >= 0.5*pu.c  # <- this can also be used.
    trad_ypred = pu.trad_clf.predict(X[s == 0])
    accuracy = metrics.accuracy_score(y[s == 0], ypred)
    trad_accuracy = metrics.accuracy_score(y[s == 0], trad_ypred)
    print "accuracy (traditional):", trad_accuracy
    print "accuracy (non-traditional):", accuracy

    # plot
    ypred = clf.predict(X)
    trad_ypred = pu.trad_clf.predict(X)
    offset = 1.0
    XX, YY = np.meshgrid(np.linspace(X[:, 0].min() - offset, X[:, 0].max() + offset, 100),
                         np.linspace(X[:, 1].min() - offset, X[:, 1].max() + offset, 100))
    Z = clf.decision_function(np.c_[XX.ravel(), YY.ravel()])
    Z = Z.reshape(XX.shape)

    plt.figure(figsize=(10, 10))
    plt.subplot(2, 2, 1)
    colors = ["r", "b"]
    plot_colors = [colors[yy] for yy in y == 1]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("true")
    plt.subplot(2, 2, 2)
    colors = ["gray", "b"]
    plot_colors = [colors[ss] for ss in s]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("positive and unlabeled data")
    plt.subplot(2, 2, 3)
    colors = ["r", "b"]
    plot_colors = [colors[yy] for yy in trad_ypred]
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("traditional (accuracy={})".format(trad_accuracy))
    plt.subplot(2, 2, 4)
    colors = ["r", "b"]
    plot_colors = [colors[yy] for yy in ypred]
    plt.contour(XX, YY, Z, levels=[0.0], colors="green")
    plt.scatter(X[:, 0], X[:, 1], s=50, color=plot_colors)
    plt.title("non-traditional (accuracy={})".format(accuracy))
    plt.tight_layout()
    plt.show()
    # plt.savefig("pusampler_demo.png")
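
The gist targets Python 2 and the pre-0.18 scikit-learn API (sklearn.cross_validation, sklearn.grid_search). A minimal sketch of the c-estimation step against the current API (an assumption, not part of the original gist: sklearn.model_selection and scikit-learn >= 1.1, where the logistic loss is spelled "log_loss"):

# Hypothetical port of PUClassifier.fit to recent scikit-learn versions.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def estimate_c(X, s, n_folds=5, random_state=0):
    # Estimate c = p(s=1|y=1) as the mean predicted probability on
    # held-out labeled positives, averaged over stratified folds.
    clf = GridSearchCV(SGDClassifier(loss="log_loss", penalty="l2"),
                       param_grid={"alpha": np.logspace(-4, 0, 10)})
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    c = np.zeros(n_folds)
    for i, (itr, ite) in enumerate(skf.split(X, s)):
        clf.fit(X[itr], s[itr])
        c[i] = clf.predict_proba(X[ite][s[ite] == 1])[:, 1].mean()
    return c.mean(), clf
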
nkt1546789 commented Jul 21, 2016

The resulting figure (pusampler_demo.png): four scatter panels showing the true labels, the positive and unlabeled data, the traditional classifier's predictions, and the non-traditional classifier's predictions with its decision boundary.

@bibekiit

  1. Hi, how does making 70% of the positives unlabeled actually help in improving accuracy?
  2. The traditional classifier is given data with only 30 pos and 570 neg. Does that help the classifier?
  3. How does assigning weights to pos and neg help?
