Skip to content

Instantly share code, notes, and snippets.

@nkt1546789
nkt1546789 / cesspr.py
Last active July 27, 2016 17:35
Content Extractor from HTML documents via Semi-Supervised PageRank.
import numpy as np
import scipy.sparse.linalg as la
import bs4
from scipy import sparse
def get_text_elements(elem):
if isinstance(elem, bs4.NavigableString):
if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
yield elem
elif elem.name not in ('script', 'style'):
@nkt1546789
nkt1546789 / textrank.py
Last active July 4, 2016 08:55
An implementation of TextRank with cosine similarity. This code is based on graphranker.py (https://gist.github.com/nkt1546789/f5a8f3c5bb4445d141fe7dd03a84bcd1).
import numpy as np
from scipy import sparse
from sklearn import preprocessing
from graphranker import GraphRanker
class TextRank(GraphRanker):
def fit(self, texts):
self.texts = texts
dictionary = {}
data = []
@nkt1546789
nkt1546789 / graphranker.py
Last active July 4, 2016 08:56
Ranks the vertices of a graph, in the manner of PageRank and TextRank.
import numpy as np
from copy import deepcopy
class GraphRanker(object):
def __init__(self, d=0.85, tol=1e-6, max_iters=200):
self.d = d
self.tol = tol
self.max_iters = max_iters
def fit(self, A):
import numpy as np
from scipy import sparse
from sklearn import preprocessing
from graphranker import GraphRanker
class TokenRank(GraphRanker):
# Initialise a TokenRank: remember the ``window`` setting, then hand every
# remaining keyword argument to the parent class initializer.
#   window -- stored unchanged as ``self.window`` (default 10).
#             NOTE(review): presumably a token co-occurrence window size;
#             confirm against the rest of the class, which is not visible here.
#   **kwds -- forwarded unchanged to the parent ``__init__``.
def __init__(self, window=10, **kwds):
self.window = window
# Explicit two-argument super() call; passes the leftover keywords upward.
super(TokenRank, self).__init__(**kwds)
@nkt1546789
nkt1546789 / ranking_svm_demo.py
Created January 18, 2016 19:54
Demo code for ranking SVM.
"""
A demo code for ranking SVM
The data used in this code comes from http://download.joachims.org/svm_light/examples/example3.tar.gz
"""
import numpy as np
import itertools
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
np.random.seed(0)
@nkt1546789
nkt1546789 / farthest_first_traversal.py
Last active November 6, 2018 14:54
An implementation of farthest-first traversal (FFT; D. Hochbaum and D. Shmoys, 1985) in Python, with a demo. FFT can be used to initialize k-means clustering.
import numpy as np
def fft(X,D,k):
"""
X: input vectors (n_samples by dimensionality)
D: distance matrix (n_samples by n_samples)
k: number of centroids
out: indices of centroids
"""
n=X.shape[0]
@nkt1546789
nkt1546789 / pu_demo.py
Last active June 19, 2019 15:41
Demo code for the PU (positive-unlabeled) classification method proposed by Elkan and Noto (2008).
import numpy as np
import matplotlib.pyplot as plt
from numpy import random
import seaborn as sns
from sklearn import metrics
from puwrapper import PUWrapper
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
sns.set_style("white")
random.seed(0)
@nkt1546789
nkt1546789 / file0.txt
Created October 27, 2015 13:35
Word2Vecを使った教師あり単語間関係分類 ref: http://qiita.com/nkt_dev/items/0f8fcfd9d09f3cfa6aa3
ブルーベリー is-a 果物
動物 has-a モルモット
動物 has-a ワタボウシタマリン
スポーツ is-a スポーツ
登山 is-a スポーツ
ロデオ is-a スポーツ
動物 has-a ユーラシアカワウソ
スポーツ has-a フリーダイビング
競馬 is-a スポーツ
スポーツ has-a ゴルフ
@nkt1546789
nkt1546789 / word_relation_data.py
Created October 27, 2015 11:48
Data for word relation learning in Japanese.
# coding: utf-8
data=[[u"スポーツ",u"競走"],
[u"スポーツ",u"跳躍"],
[u"スポーツ",u"投てき"],
[u"スポーツ",u"混成"],
[u"スポーツ",u"トライアスロン"],
[u"スポーツ",u"バイアスロン"],
[u"スポーツ",u"近代五種"],
[u"スポーツ",u"水泳"],
[u"スポーツ",u"競泳"],
import numpy as np
class RbfModelWrapper(object):
def __init__(self,model,gamma=1.,**kwds):
self._model=model
self.gamma=gamma
def fit(self,X,y):
X2=np.c_[np.sum(X**2,1)]
Phi=np.exp(-self.gamma*(X2+X2.T-2*X.dot(X.T)))