Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A wrapper class for PU (positive-unlabeled) classification in Python, following the method proposed by Elkan and Noto (2008).
import numpy as np
from numpy import random
from sklearn import base
class PUWrapper(object):
    """Positive-unlabeled (PU) wrapper around a probabilistic classifier.

    The wrapped classifier is trained on the "labeled" indicator ``s``
    (1 = labeled positive, anything else = unlabeled).  ``fit`` then
    estimates the constant ``c_`` as the average score the classifier
    assigns to the labeled positives, and ``predict`` thresholds the raw
    score at ``0.5 * c_`` (i.e. score / c_ >= 0.5, the Elkan & Noto
    rescaling).

    Parameters
    ----------
    trad_clf : object
        Classifier exposing ``fit(X, s)`` and ``predict_proba(X)``; column 1
        of ``predict_proba`` is used as the score.
    n_fold : int, default 5
        Number of groups the labeled positives are split into when
        averaging their scores.

    Attributes
    ----------
    c_ : float
        Set by ``fit``: mean over the non-empty groups of the mean score of
        the labeled positives in each group.
    """

    def __init__(self, trad_clf, n_fold=5):
        self._trad_clf = trad_clf
        self._n_fold = n_fold

    def fit(self, X, s):
        """Fit the wrapped classifier on ``(X, s)`` and estimate ``c_``.

        Parameters
        ----------
        X : indexable by a boolean row mask (e.g. a NumPy array)
            Feature matrix.
        s : array-like
            Label indicator per row; rows with ``s == 1`` are the labeled
            positives.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no row has ``s == 1`` (``c_`` cannot be estimated).
        """
        # Build the mask through asarray so plain lists work, and count via
        # the mask so X need not support len().
        labeled = np.asarray(s) == 1
        n = int(labeled.sum())
        if n == 0:
            raise ValueError(
                "PUWrapper.fit needs at least one sample with s == 1")

        self._trad_clf.fit(X, s)
        Xp = X[labeled]

        # Integer fold labels 0..n_fold-1.  Floor division is required: true
        # division (Python 3 `/`) yields fractional labels that never equal
        # the integer fold index.
        cv_split = np.arange(n) * self._n_fold // n
        cv_index = cv_split[random.permutation(n)]

        # NOTE: the classifier is not refit per fold, so every fold is scored
        # by the same model; scoring all positives once is equivalent to
        # scoring each fold separately and avoids empty-input calls.
        scores = np.asarray(self._trad_clf.predict_proba(Xp))[:, 1]

        # Folds that received no sample (n < n_fold) are skipped instead of
        # contributing NaN to the average.
        fold_means = []
        for k in range(self._n_fold):
            in_fold = cv_index == k
            if in_fold.any():
                fold_means.append(scores[in_fold].mean())
        self.c_ = np.mean(fold_means)
        return self

    def predict_proba(self, X):
        """Return the wrapped classifier's ``predict_proba(X)`` unchanged.

        The values are the scores for ``s`` (labeled vs. unlabeled); they
        are not divided by ``c_``.
        """
        return self._trad_clf.predict_proba(X)

    def predict(self, X):
        """Return a boolean array: True where the score is >= ``0.5 * c_``.

        Uses column 1 of ``predict_proba(X)``; requires ``fit`` to have set
        ``c_``.
        """
        proba = self.predict_proba(X)[:, 1]
        return proba >= (0.5 * self.c_)
@guicho271828

This comment has been minimized.

Show comment
Hide comment
@guicho271828

guicho271828 Aug 1, 2017

Why did you split the examples into n_fold groups, take the mean of each group, and then take the mean of those means? The result should be the same as the mean over the whole data.

Why did you split the examples into n_fold groups, take the mean of each group, and then take the mean of those means? The result should be the same as the mean over the whole data.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment