@nkt1546789
nkt1546789 / puwrapper.py
Last active April 12, 2019 20:16
A wrapper class for PU (positive-unlabeled) classification in Python, implementing the method proposed by Elkan and Noto (2008).
import numpy as np
from numpy import random
from sklearn import base

class PUWrapper(object):
    def __init__(self, trad_clf, n_fold=5):
        self._trad_clf = trad_clf
        self._n_fold = n_fold

    def fit(self, X, s):
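The preview cuts off at fit. As a minimal sketch of the Elkan and Noto (2008) estimator the wrapper implements, assuming trad_clf exposes predict_proba and s is a 0/1 array marking labeled positives (fit_pu is a hypothetical stand-in, not the gist's actual method):

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict

def fit_pu(trad_clf, X, s, n_fold=5):
    # Train the traditional classifier to separate labeled (s=1) from unlabeled (s=0),
    # using out-of-fold predictions so the estimate of c is not optimistic.
    g = cross_val_predict(clone(trad_clf), X, s, cv=n_fold,
                          method="predict_proba")[:, 1]
    c = g[s == 1].mean()              # c ~ p(s=1 | y=1), estimated on labeled positives
    clf = clone(trad_clf).fit(X, s)
    return clf, c                     # p(y=1 | x) ~ clf.predict_proba(x)[:, 1] / c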
@nkt1546789
nkt1546789 / contentextractor.py
Last active August 29, 2015 14:25
A wrapper around dragnet's content extractor that applies a configurable probability threshold to the block scores.
from dragnet import content_extractor

classes = list(content_extractor._block_model.classes_)
positive_idx = classes.index(1)

def extract(html, block=False, threshold=0.2):
    features, blocks = content_extractor.make_features(html)
    scores = content_extractor._block_model.predict_proba(features)[:, positive_idx]
    if block:
        return [block for i, block in enumerate(blocks) if scores[i] >= threshold]
    return " ".join([block.text for i, block in enumerate(blocks) if scores[i] >= threshold])
@nkt1546789
nkt1546789 / lrridge.py
Created July 23, 2015 08:45
Laplacian Regularized Ridge Classifier (LRRidge) in Python, for semi-supervised classification tasks. In the demo, LRRidge is compared with ordinary ridge regression, both using a Gaussian kernel model.
import numpy as np
import matplotlib.pyplot as plt
from numpy import linalg, random
from sklearn.base import BaseEstimator
from sklearn import datasets, metrics

class LRRidge(BaseEstimator):
    def __init__(self, alpha=1., beta=1., gamma=10., k=10):
        self.alpha = alpha
        self.beta = beta
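The preview stops inside __init__. For reference, a Laplacian-regularized ridge fit can be sketched in closed form as below; the parameter names mirror the gist's __init__, but the graph construction and objective details are assumptions, not necessarily what lrridge.py does:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import kneighbors_graph

def fit_lrridge(X_labeled, y, X_all, alpha=1., beta=1., gamma=10., k=10):
    # Gaussian kernel model with all (labeled + unlabeled) points as centers.
    Phi = rbf_kernel(X_labeled, X_all, gamma=gamma)
    # Unnormalized graph Laplacian of a symmetrized k-NN graph.
    W = kneighbors_graph(X_all, n_neighbors=k).toarray()
    W = np.maximum(W, W.T)
    L = np.diag(W.sum(axis=1)) - W
    Psi = rbf_kernel(X_all, X_all, gamma=gamma)
    # Solve (Phi^T Phi + alpha I + beta Psi^T L Psi) theta = Phi^T y.
    A = Phi.T.dot(Phi) + alpha * np.eye(Phi.shape[1]) + beta * Psi.T.dot(L).dot(Psi)
    return np.linalg.solve(A, Phi.T.dot(y))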
@nkt1546789
nkt1546789 / w2v_document_classification.py
Last active August 29, 2015 14:25
An implementation of document classification using word2vec (w2v). The livedoor news corpus is used for evaluation.
# coding: utf-8
import os,codecs,re,pickle
import numpy as np
from gensim.models import word2vec
from gensim import matutils
basepath="/path/to/corpus_dir"
dir_names=["dokujo-tsushin","it-life-hack","kaden-channel","livedoor-homme","movie-enter","peachy","smax","sports-watch","topic-news"]
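The preview ends before the feature construction. A sketch of the usual word2vec document representation, the mean of the word vectors unit-normalized with gensim's matutils (doc_vector and its token input are hypothetical, not lifted from the gist):

import numpy as np
from gensim import matutils

def doc_vector(model, tokens):
    # Average the vectors of in-vocabulary words; old gensim allows model[w] lookups.
    vecs = [model[w] for w in tokens if w in model]
    if not vecs:
        return np.zeros(model.vector_size)
    return matutils.unitvec(np.mean(vecs, axis=0))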
@nkt1546789
nkt1546789 / bsm.py
Last active February 6, 2018 14:56
Basic summarization model in Python.
# -*- coding: utf-8 -*-
import MeCab
import numpy as np

m = MeCab.Tagger("-Ochasen")

def sent_tokenize(text):
    # Python 2: MeCab expects a UTF-8 byte string.
    if type(text) is unicode:
        text = text.encode("utf8")
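The preview cuts off inside sent_tokenize. A minimal sketch of a basic frequency-scored extractive summarizer, the kind of model the title suggests (the scoring scheme here is an assumption, written in Python 3):

import collections

def summarize(sentences, tokenized, n=3):
    # Score each sentence by the average corpus frequency of its words,
    # then return the top-n sentences in their original order.
    freq = collections.Counter(w for toks in tokenized for w in toks)
    scores = [sum(freq[w] for w in toks) / max(len(toks), 1) for toks in tokenized]
    top = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:n]
    return [sentences[i] for i in sorted(top)]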
@nkt1546789
nkt1546789 / skmulti_demo.py
Created July 18, 2015 03:48
Demo script for scikit-multilearn; it should be executed in /path/to/scikit-multilearn.
from skmultilearn.dataset import Dataset
from skmultilearn.meta.br import BinaryRelevance
from sklearn.svm import SVC
from sklearn.metrics import hamming_loss
train_set=Dataset.load_dataset_dump("skmultilearn/data/scene-train.dump.bz2")
test_set=Dataset.load_dataset_dump("skmultilearn/data/scene-test.dump.bz2")
clf=BinaryRelevance(SVC(kernel="linear"))
clf.fit(train_set["X"],train_set["y"])
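The preview stops after fit, but hamming_loss is imported, so an evaluation step presumably follows. A sketch of the likely continuation:

# Assumed continuation: predict on the test split and report Hamming loss.
predictions = clf.predict(test_set["X"])
print("Hamming loss:", hamming_loss(test_set["y"], predictions))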
@nkt1546789
nkt1546789 / visualize_filetype.py
Created July 9, 2015 17:07
Visualization of file types on your computer.
import os,re
from os import path
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
sns.set(style="white", context="talk")
HOMEPATH="/path/to/your/home_directory"
MAX_DEPTH=5
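The preview ends at the configuration constants. A sketch of how the walk-count-plot step might proceed with the imports above (the depth pruning and top-20 cut are assumptions):

# Hypothetical continuation: count file extensions up to MAX_DEPTH, then bar-plot.
counts = {}
for root, dirs, files in os.walk(HOMEPATH):
    if root[len(HOMEPATH):].count(os.sep) >= MAX_DEPTH:
        dirs[:] = []  # prune: do not descend any deeper
        continue
    for f in files:
        ext = path.splitext(f)[1].lower() or "<none>"
        counts[ext] = counts.get(ext, 0) + 1
pd.Series(counts).sort_values(ascending=False)[:20].plot(kind="bar")
plt.show()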
# coding: utf-8
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from gensim.models import word2vec
import numpy as np
np.random.seed(0)
model=word2vec.Word2Vec.load("/path/to/your/model")
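The imports above suggest a grid-searched linear SVC over word2vec-based features. A sketch of the usual continuation, with X and y assumed to be precomputed document vectors and labels (both hypothetical here):

# Assumed continuation: tune the SVC regularization parameter by cross-validation.
param_grid = {"C": [0.1, 1.0, 10.0, 100.0]}
gs = GridSearchCV(SVC(kernel="linear"), param_grid, cv=5)
gs.fit(X, y)
print("best C:", gs.best_params_["C"])
print("training accuracy:", accuracy_score(y, gs.predict(X)))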
@nkt1546789
nkt1546789 / kitml.py
Last active August 29, 2015 14:22
Implementation of kernel information-theoretic metric learning (ITML), proposed by J. V. Davis et al. (2007).
import numpy as np

def KernelITML(K, constraints, dm=None, dc=None, gamma=1.0, max_iter=1000, stop_threshold=1e-3):
    """
    K: initial kernel matrix.
    constraints: array or list whose elements have the form (delta, i, j), where
        delta = 1 if (i, j) is a must-link pair and delta = -1 if (i, j) is a cannot-link pair.
    dm: target distance for must-link pairs; if not provided, dm is selected automatically.
    dc: target distance for cannot-link pairs.
    gamma: trade-off parameter; gamma = 1 gives a stable solution.
    max_iter: maximum number of iterations.
    stop_threshold: convergence tolerance.
    """
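A short usage sketch for KernelITML, deriving must-link/cannot-link constraints from class labels in the (delta, i, j) form the docstring describes (the data and the return-value handling are assumptions):

# Hypothetical usage: build constraints from labels and learn the kernel.
import itertools
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.randn(30, 5)
y = np.random.randint(0, 2, 30)
K0 = rbf_kernel(X)
constraints = [(1 if y[i] == y[j] else -1, i, j)
               for i, j in itertools.combinations(range(len(y)), 2)]
K_learned = KernelITML(K0, constraints, gamma=1.0)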
@nkt1546789
nkt1546789 / coo_mat.py
Created May 25, 2015 11:45
Creating a co-occurrence matrix in Python using scipy.sparse.coo_matrix.
import codecs
from scipy.sparse import coo_matrix

def create_cooccurrence_matrix(filename, tokenizer, window_size):
    vocabulary = {}
    data = []
    row = []
    col = []
    for sentence in codecs.open(filename, "r", "utf-8"):
        sentence = sentence.strip()
        tokens = [token for token in tokenizer(sentence) if token != u""]
        for pos, token in enumerate(tokens):
            i = vocabulary.setdefault(token, len(vocabulary))
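The preview cuts off inside the position loop. A sketch of the usual windowed scan and the final coo_matrix construction, continuing at the same indentation (an assumed completion, guided by the description):

            # Count neighbors within window_size positions on either side.
            start = max(0, pos - window_size)
            end = min(len(tokens), pos + window_size + 1)
            for pos2 in range(start, end):
                if pos2 == pos:
                    continue
                j = vocabulary.setdefault(tokens[pos2], len(vocabulary))
                row.append(i)
                col.append(j)
                data.append(1.0)
    n = len(vocabulary)
    return vocabulary, coo_matrix((data, (row, col)), shape=(n, n))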