This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from numpy import random | |
from sklearn import base | |
class PUWrapper(object):
    """Positive-Unlabeled (PU) learning wrapper around a traditional classifier.

    NOTE(review): the body of fit() is not visible in this chunk; the wrapper
    presumably calibrates the wrapped model's scores from positive/unlabeled
    data (Elkan & Noto style) -- confirm against the full implementation.
    """

    def __init__(self, trad_clf, n_fold=5):
        # Underlying traditional (positive vs. negative) classifier to wrap.
        self._trad_clf = trad_clf
        # Number of folds; presumably used for cross-validated estimation
        # inside fit() -- TODO confirm, fit() body not visible here.
        self._n_fold = n_fold
def fit(self,X,s): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dragnet import content_extractor

# Locate the column of the block model's predict_proba output that
# corresponds to class label 1 (the "content" class), so extract() below
# can read content scores regardless of the classes_ ordering.
classes = list(content_extractor._block_model.classes_)
positive_idx = classes.index(1)
def extract(html, block=False, threshold=0.2):
    """Extract main content from an HTML document with dragnet's block model.

    Parameters
    ----------
    html : str
        Raw HTML of the page.
    block : bool, default False
        If True, return the kept block objects; otherwise return their
        text joined into a single space-separated string.
    threshold : float, default 0.2
        Minimum content probability a block must reach to be kept.
    """
    features, blocks = content_extractor.make_features(html)
    proba = content_extractor._block_model.predict_proba(features)
    scores = proba[:, positive_idx]
    kept = [blk for blk, score in zip(blocks, scores) if score >= threshold]
    if block:
        return kept
    return " ".join(blk.text for blk in kept)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from numpy import linalg,random | |
from sklearn.base import BaseEstimator | |
from sklearn import datasets,metrics | |
class LRRidge(BaseEstimator): | |
def __init__(self,alpha=1.,beta=1.,gamma=10.,k=10): | |
self.alpha=alpha | |
self.beta=beta |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import os,codecs,re,pickle | |
import numpy as np | |
from gensim.models import word2vec | |
from gensim import matutils | |
# Root directory of the news corpus on the local machine (placeholder --
# replace with the actual path before running).
basepath = "/path/to/corpus_dir"
# Category subdirectories of the corpus (livedoor news corpus categories);
# each directory holds the articles for one category.
dir_names = ["dokujo-tsushin", "it-life-hack", "kaden-channel", "livedoor-homme", "movie-enter", "peachy", "smax", "sports-watch", "topic-news"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import MeCab | |
import numpy as np | |
m = MeCab.Tagger("-Ochasen") | |
def sent_tokenize(text): | |
if type(text) is unicode: | |
text = text.encode("utf8") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from skmultilearn.dataset import Dataset | |
from skmultilearn.meta.br import BinaryRelevance | |
from sklearn.svm import SVC | |
from sklearn.metrics import hamming_loss | |
# Load the pre-serialized "scene" train/test splits bundled with
# scikit-multilearn (bz2-compressed pickle dumps).
train_set = Dataset.load_dataset_dump("skmultilearn/data/scene-train.dump.bz2")
test_set = Dataset.load_dataset_dump("skmultilearn/data/scene-test.dump.bz2")
# Binary relevance strategy: one independent linear SVM per label.
clf = BinaryRelevance(SVC(kernel="linear"))
clf.fit(train_set["X"], train_set["y"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os,re | |
from os import path | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import pandas as pd | |
# Global seaborn theme for all plots produced by this script.
sns.set(style="white", context="talk")
# Placeholder for the user's home directory -- replace before running.
HOMEPATH = "/path/to/your/home_directory"
# Maximum directory depth to descend when walking files (use below not
# visible in this chunk -- TODO confirm).
MAX_DEPTH = 5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from sklearn.svm import SVC | |
from sklearn.grid_search import GridSearchCV | |
from sklearn.metrics import accuracy_score | |
from gensim.models import word2vec | |
import numpy as np | |
# Fix the NumPy RNG so results are reproducible across runs.
np.random.seed(0)
# Pre-trained gensim word2vec model (placeholder path -- replace before running).
model = word2vec.Word2Vec.load("/path/to/your/model")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def KernelITML(K,constraints,dm=None,dc=None,gamma=1.0,max_iter=1000,stop_threshold=1e-3): | |
""" | |
K: initial kernel matrix. | |
constraints: array or list whose element is in the form of (delta,i,j), where delta=1 if (i,j) is must-link and delta=-1 if (i,j) is cannot-link. | |
dm: target distance for must-link. if not provided, dm is automatically selected. | |
dc: target distance for cannot-link. | |
gamma: trade-off parameter. gamma=1 gives stable solution. | |
max_iter: maximum number of iteration. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_cooccurrence_matrix(filename,tokenizer,window_size): | |
vocabulary={} | |
data=[] | |
row=[] | |
col=[] | |
for sentence in codecs.open(filename,"r","utf-8"): | |
sentence=sentence.strip() | |
tokens=[token for token in tokenizer(sentence) if token!=u""] | |
for pos,token in enumerate(tokens): | |
i=vocabulary.setdefault(token,len(vocabulary)) |