Skip to content

Instantly share code, notes, and snippets.

@TomDLT
Created November 9, 2015 09:19
Show Gist options
  • Save TomDLT/c48c044824d98f530456 to your computer and use it in GitHub Desktop.
Save TomDLT/c48c044824d98f530456 to your computer and use it in GitHub Desktop.
# Author: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
# License: BSD 3 clause
from __future__ import print_function
from time import time
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pandas
import nimfa
from sklearn.utils.testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.nmf import (NMF, _safe_compute_error,
_initialize_nmf)
from sklearn.externals.joblib import Memory
# joblib Memory object using the current directory as its cache location;
# bench_one below is decorated with ``m.cache``.
m = Memory(cachedir='.', verbose=0)
def plot_results(results_df):
    """Plot loss against fitting time, one subplot per initialization.

    Each distinct value of the ``init`` column gets its own subplot, and
    each distinct value of the ``method`` column is drawn as one curve in
    it.  All subplots share their x and y axes with the first one.

    Parameters
    ----------
    results_df : pandas.DataFrame or None
        Table with the columns ``method``, ``loss``, ``time`` and ``init``.
        When ``None``, nothing is plotted.

    Returns
    -------
    None
    """
    if results_df is None:
        return None

    plt.figure(figsize=(16, 10))
    colors = 'bgrcmk'
    markers = 'ovs'

    inits = np.unique(results_df['init'])
    methods = np.unique(results_df['method'])

    # Keep the 2 x 3 layout for up to six initializations, and add rows
    # beyond that so that plt.subplot is never asked for a missing cell.
    n_cols = 3
    n_rows = max(2, -(-len(inits) // n_cols))

    ax = plt.subplot(n_rows, n_cols, 1)
    for i, init in enumerate(inits):
        if i > 0:
            # The first cell already exists (``ax``); only the following
            # ones have to be created, sharing their axes with it.
            plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)

        for j, method in enumerate(methods):
            # A single combined mask: chaining two boolean selections
            # would apply a full-length mask to an already filtered frame.
            mask = ((results_df['init'] == init) &
                    (results_df['method'] == method))
            selected_items = results_df[mask]

            plt.plot(selected_items['time'], selected_items['loss'],
                     color=colors[j % len(colors)], ls='-',
                     marker=markers[j % len(markers)],
                     label=method)

        plt.legend(loc=0, fontsize='x-small')
        plt.xlabel("Time (s)")
        plt.ylabel("loss")
        plt.title("%s" % init)

    plt.show()
    plt.close('all')
# X_shape is for avoiding hashing X: X itself is excluded from the cache key
# through ``ignore=['X']``.
@ignore_warnings
@m.cache(ignore=['X'])
def bench_one(name, X, X_shape, clf_type, clf_params, n_components, init,
              random_state):
    """Fit one factorization of X and return its loss and fitting time.

    W and H are first initialized with ``_initialize_nmf``; that step is not
    included in the reported duration.

    Parameters
    ----------
    name : str
        Label of the method.  Names starting with ``'nimfa'`` select the
        nimfa-style calling convention, anything else the estimator-style
        one (``fit_transform`` / ``components_``).
    X : array-like or sparse matrix
        Data to factorize.
    X_shape : tuple
        Not used in the body; see the comment above the decorators.
    clf_type : callable
        Class of the model.  Called as
        ``clf_type(X, seed=None, W=W, H=H, **clf_params)`` for nimfa names,
        and as ``clf_type(**clf_params)`` otherwise.
    clf_params : dict
        Keyword arguments forwarded to ``clf_type``.
    n_components : int
        Passed to ``_initialize_nmf``.
    init : str
        Initialization method passed to ``_initialize_nmf``.
    random_state : int
        Passed to ``_initialize_nmf``.

    Returns
    -------
    this_loss : float
        Value of ``_safe_compute_error(X, W, H)`` for the fitted factors.
    duration : float
        Seconds elapsed while fitting, measured with ``time()``.
    """
    W, H = _initialize_nmf(X, n_components, init, 1e-6, random_state)

    if name.startswith('nimfa'):
        clf = clf_type(X, seed=None, W=W, H=H, **clf_params)
        st = time()
        clf_fit = clf()
        end = time()
        W = clf_fit.basis()
        H = clf_fit.coef()
        # Decide for each factor on its own whether it has to be densified,
        # instead of assuming that H is sparse whenever W is.
        W = np.asarray(W.todense() if sp.issparse(W) else W)
        H = np.asarray(H.todense() if sp.issparse(H) else H)
    else:
        clf = clf_type(**clf_params)
        st = time()
        W = clf.fit_transform(X, W=W, H=H)
        end = time()
        H = clf.components_

    this_loss = _safe_compute_error(X, W, H)
    return this_loss, end - st
def run_bench(X, clfs, n_components, tol):
    """Benchmark every method on X for several inits and iteration budgets.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data to factorize; must expose a ``shape`` attribute.
    clfs : iterable of (name, clf_type, iter_range, clf_params)
        Methods to benchmark.  ``iter_range`` lists the ``max_iter`` values
        to try and ``clf_params`` holds the remaining keyword arguments.
        The ``clf_params`` dicts are not modified.
    n_components : int
        Number of components, passed as ``rank`` to methods whose name
        starts with ``'nimfa'`` and as ``n_components`` to the others.
    tol : float
        Tolerance passed to the non-nimfa methods.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per run, with the columns ``method``, ``loss``, ``time``
        and ``init``.  The ``init`` entry is the init name followed by its
        position in the list of inits, which is also used as random seed.
    """
    inits = ('random', 'random', 'random', 'nndsvd', 'nndsvda', 'nndsvdar')

    results = []
    for name, clf_type, iter_range, clf_params in clfs:
        print("______________training %s ______________" % name)
        is_nimfa = name.startswith('nimfa')

        for rs, init in enumerate(inits):
            print(init)
            for itr in iter_range:
                # Work on a copy so that the dict owned by the caller is
                # left untouched.
                params = dict(clf_params)
                params['max_iter'] = itr
                if is_nimfa:
                    params['rank'] = n_components
                else:
                    params['n_components'] = n_components
                    params['tol'] = tol
                    params['random_state'] = rs
                    # W and H are supplied by bench_one.
                    params['init'] = 'custom'

                this_loss, duration = bench_one(name, X, X.shape, clf_type,
                                                params,
                                                n_components, init, rs)

                init_name = init + " " + str(rs)
                results.append((name, this_loss, duration, init_name))
                print("loss: %.8f, time: %.6f sec" % (this_loss, duration))

    results_df = pandas.DataFrame(results,
                                  columns="method loss time init".split())
    print(" ")
    return results_df
def load_20news():
    """Return the TF-IDF matrix of the 20 newsgroups documents.

    The documents are fetched with headers, footers and quotes removed and
    vectorized with ``max_df=0.95``, ``min_df=2`` and English stop words.
    """
    print("Loading 20 newsgroups dataset...")
    from sklearn.datasets import fetch_20newsgroups

    newsgroups = fetch_20newsgroups(
        shuffle=True,
        random_state=1,
        remove=('headers', 'footers', 'quotes'))
    tfidf_builder = TfidfVectorizer(max_df=0.95, min_df=2,
                                    stop_words='english')
    return tfidf_builder.fit_transform(newsgroups.data)
def load_faces():
    """Return the ``data`` attribute of the shuffled Olivetti faces dataset."""
    print("Loading Olivetti face dataset...")
    from sklearn.datasets import fetch_olivetti_faces

    return fetch_olivetti_faces(shuffle=True).data
def load_data_rcv1():
    """Return the ``data`` attribute of the RCV1 dataset."""
    print("Loading RCV1 dataset...")
    from sklearn.datasets import fetch_rcv1

    return fetch_rcv1().data
def load_data_mnist():
    """Return the ``data`` attribute of the 'MNIST original' dataset."""
    print("Loading MNIST dataset...")
    from sklearn.datasets import fetch_mldata

    return fetch_mldata('MNIST original').data
#################################################################
def build_clfs(alpha, l1_ratio, iters, pg_iters, nimfa_iters):
    """Return the list of methods to benchmark.

    Each entry is a ``(name, clf_type, iter_range, clf_params)`` tuple.
    The three NMF entries receive ``alpha`` and ``l1_ratio``; the nimfa
    entry only receives ``test_conv``.  ``iters`` is used for the "Greedy"
    and "CD" entries, ``pg_iters`` for "proj-grad" and ``nimfa_iters`` for
    "nimfa proj-grad".
    """
    def nmf_params(solver, **extra):
        # A new dict for every entry, so that no two entries share one.
        params = {'solver': solver, 'alpha': alpha, 'l1_ratio': l1_ratio}
        params.update(extra)
        return params

    greedy = ("Greedy", NMF, iters, nmf_params('greedy'))
    coordinate = ("CD", NMF, iters, nmf_params('coordinate'))
    proj_grad = ("proj-grad", NMF, pg_iters,
                 nmf_params('proj-grad', nls_max_iter=10))
    nimfa_proj_grad = ("nimfa proj-grad", nimfa.Lsnmf, nimfa_iters,
                       {'test_conv': 1000})

    return [greedy, coordinate, proj_grad, nimfa_proj_grad]
if __name__ == "__main__":
    # Every result starts as None, a value that plot_results skips.
    results_20news = results_faces = results_rcv1 = results_mnist = None

    # regularization
    alpha = 0.
    l1_ratio = 0.1
    k = 10

    # ---- 20_News ----
    iters = np.arange(1, 200, 16)
    pg_iters = np.arange(1, 100, 8)
    nimfa_iters = np.arange(1, 50, 8)
    clfs = build_clfs(alpha, l1_ratio, iters, pg_iters, nimfa_iters)
    X_20news = load_20news()
    results_20news = run_bench(X_20news, clfs, n_components=k, tol=1e-12)

    # ---- Faces ----
    iters = np.arange(1, 100, 8)
    pg_iters = np.arange(1, 100, 8)
    nimfa_iters = np.arange(1, 50, 8)
    clfs = build_clfs(alpha, l1_ratio, iters, pg_iters, nimfa_iters)
    X_faces = load_faces()
    results_faces = run_bench(X_faces, clfs, n_components=k, tol=1e-12)

    # ---- RCV1 ----
    # The projected-gradient iteration counts are also used for nimfa here.
    iters = np.arange(1, 22, 4)
    pg_iters = np.arange(1, 10, 2)
    clfs = build_clfs(alpha, l1_ratio, iters, pg_iters, pg_iters)
    X_rcv1 = load_data_rcv1()
    results_rcv1 = run_bench(X_rcv1, clfs, n_components=15, tol=1e-12)

    # ---- MNIST ----
    # A single list of iteration counts is shared by all the methods.
    iters = np.arange(1, 40, 4)
    clfs = build_clfs(alpha, l1_ratio, iters, iters, iters)
    X_mnist = load_data_mnist()
    results_mnist = run_bench(X_mnist, clfs, n_components=k, tol=1e-12)

    # All the figures are shown once every benchmark has been run.
    for results in (results_20news, results_faces, results_rcv1,
                    results_mnist):
        plot_results(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment