Skip to content

Instantly share code, notes, and snippets.

@nkt1546789
nkt1546789 / puclassifier.py
Last active May 12, 2022 15:13
Learning classifiers from positive and unlabeled data using the sample-weighting method proposed by Elkan and Noto (2008).
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
class PUClassifier(object):
def __init__(self, trad_clf=None, n_folds=2):
self.trad_clf = trad_clf
self.n_folds = n_folds
@nkt1546789
nkt1546789 / kde_regression.py
Last active August 25, 2021 22:36
An example of regression using kernel density estimation (KDE)
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
random_state = 1
n_samples = 200
@nkt1546789
nkt1546789 / lrridge.py
Created July 23, 2015 08:45
Laplacian-regularized Ridge classifier (LRRidge) in Python, intended for semi-supervised classification tasks. In the demo, we compare LRRidge with ordinary Ridge using a Gaussian kernel model.
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg,random
from sklearn.base import BaseEstimator
from sklearn import datasets,metrics
class LRRidge(BaseEstimator):
def __init__(self,alpha=1.,beta=1.,gamma=10.,k=10):
self.alpha=alpha
self.beta=beta
@nkt1546789
nkt1546789 / coo_mat.py
Created May 25, 2015 11:45
Creating a co-occurrence matrix in Python using scipy.sparse.coo_matrix
def create_cooccurrence_matrix(filename,tokenizer,window_size):
vocabulary={}
data=[]
row=[]
col=[]
for sentence in codecs.open(filename,"r","utf-8"):
sentence=sentence.strip()
tokens=[token for token in tokenizer(sentence) if token!=u""]
for pos,token in enumerate(tokens):
i=vocabulary.setdefault(token,len(vocabulary))
@nkt1546789
nkt1546789 / pu_demo.py
Last active June 19, 2019 15:41
Demo code for the PU classification method proposed by Elkan and Noto (2008)
import numpy as np
import matplotlib.pyplot as plt
from numpy import random
import seaborn as sns
from sklearn import metrics
from puwrapper import PUWrapper
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
sns.set_style("white")
random.seed(0)
import numpy as np
class RbfModelWrapper(object):
# Constructor: remember the model to be wrapped and the kernel parameter gamma.
# `gamma` is the multiplier applied inside np.exp(-gamma * ...) in fit().
# NOTE(review): **kwds is accepted but not used in the lines visible here —
# confirm whether it is meant to be forwarded to the wrapped model.
def __init__(self,model,gamma=1.,**kwds):
self._model=model
self.gamma=gamma
def fit(self,X,y):
X2=np.c_[np.sum(X**2,1)]
Phi=np.exp(-self.gamma*(X2+X2.T-2*X.dot(X.T)))
@nkt1546789
nkt1546789 / puwrapper.py
Last active April 12, 2019 20:16
A wrapper class for PU classification in Python (method proposed by Elkan and Noto, 2008).
import numpy as np
from numpy import random
from sklearn import base
class PUWrapper(object):
# Constructor: store the supplied classifier and the fold count (default 5)
# as private attributes for later use.
# NOTE(review): how _trad_clf and _n_fold are consumed is not visible in this
# excerpt — presumably by fit(); confirm against the full file.
def __init__(self,trad_clf,n_fold=5):
self._trad_clf=trad_clf
self._n_fold=n_fold
def fit(self,X,s):
@nkt1546789
nkt1546789 / farthest_first_traversal.py
Last active November 6, 2018 14:54
An implementation of farthest-first traversal (FFT; D. Hochbaum and D. Shmoys, 1985) in Python, with a demo. FFT can be used to initialize k-means clustering.
import numpy as np
def fft(X,D,k):
"""
X: input vectors (n_samples by dimensionality)
D: distance matrix (n_samples by n_samples)
k: number of centroids
out: indices of centroids
"""
n=X.shape[0]
@nkt1546789
nkt1546789 / bsm.py
Last active February 6, 2018 14:56
Basic summarization model in Python.
# coding: utf-8 -*-
import MeCab
import numpy as np
m = MeCab.Tagger("-Ochasen")
def sent_tokenize(text):
if type(text) is unicode:
text = text.encode("utf8")
@nkt1546789
nkt1546789 / collage_template_generation.py
Created June 25, 2017 02:07
Collage template generation
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
def generate_template(n, width, height, random_state=1, max_random_state=10000, offset=0):
L = [np.array([offset, offset, width-offset, height-offset])]
random_state_lists = stats.randint.rvs(0, max_random_state, size=(n-1, 4), random_state=random_state)
for random_state_list in random_state_lists:
n_areas = len(L)