Created
November 9, 2015 09:19
-
-
Save TomDLT/c48c044824d98f530456 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org> | |
# License: BSD 3 clause | |
from __future__ import print_function | |
from time import time | |
import numpy as np | |
import scipy.sparse as sp | |
import matplotlib.pyplot as plt | |
import pandas | |
import nimfa | |
from sklearn.utils.testing import ignore_warnings | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.decomposition.nmf import (NMF, _safe_compute_error, | |
_initialize_nmf) | |
from sklearn.externals.joblib import Memory | |
# joblib cache rooted in the current directory; ``bench_one`` is wrapped with
# ``m.cache`` so its results are stored there between calls.
m = Memory(verbose=0, cachedir='.')
def plot_results(results_df):
    """Plot loss against fitting time, one subplot per initialization.

    Parameters
    ----------
    results_df : pandas.DataFrame or None
        Benchmark results with the columns 'method', 'loss', 'time' and
        'init' (the frame built by ``run_bench``). ``None`` is accepted and
        means there is nothing to plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and then closed.
    """
    if results_df is None:
        return None

    plt.figure(figsize=(16, 10))
    colors = 'bgrcmk'
    markers = 'ovs'

    init_names = np.unique(results_df['init'])
    method_names = np.unique(results_df['method'])

    # Three panels per row; keep the historical 2x3 layout for up to six
    # initializations and add rows only when more are present.
    n_cols = 3
    n_rows = max(2, -(-len(init_names) // n_cols))

    # Reference axes: every panel shares its x and y ranges with this one.
    ax = plt.subplot(n_rows, n_cols, 1)
    for i, init in enumerate(init_names):
        plt.subplot(n_rows, n_cols, i + 1, sharex=ax, sharey=ax)
        init_mask = results_df['init'] == init
        for j, method in enumerate(method_names):
            # Use a single combined mask: indexing an already filtered frame
            # with a mask computed on the full frame forces pandas to
            # reindex the mask (and to warn about it).
            selected_items = results_df[
                init_mask & (results_df['method'] == method)]
            plt.plot(selected_items['time'], selected_items['loss'],
                     color=colors[j % len(colors)], ls='-',
                     marker=markers[j % len(markers)],
                     label=method)
        plt.legend(loc=0, fontsize='x-small')
        plt.xlabel("Time (s)")
        plt.ylabel("loss")
        plt.title("%s" % init)
    plt.show()
    plt.close('all')
def _as_dense_array(factor):
    """Return ``factor`` as a plain ndarray, densifying scipy sparse input."""
    if sp.issparse(factor):
        factor = factor.todense()
    return np.asarray(factor)


# X is left out of the cache key (ignore=['X']) to avoid hashing it; X_shape
# is passed alongside so that the key still carries the shape of the data.
@ignore_warnings
@m.cache(ignore=['X'])
def bench_one(name, X, X_shape, clf_type, clf_params, n_components, init,
              random_state):
    """Fit one solver once and report its loss and fitting time.

    Parameters
    ----------
    name : str
        Solver label; labels starting with 'nimfa' select the nimfa calling
        convention, anything else is treated as a scikit-learn estimator.
    X : array-like or sparse matrix
        Data to factorize (ignored by the cache).
    X_shape : tuple
        Shape of ``X``; unused in the body, present only for the cache key.
    clf_type : callable
        Factory of the model (``clf_type(X, ..., **clf_params)`` for nimfa,
        ``clf_type(**clf_params)`` otherwise).
    clf_params : dict
        Keyword arguments forwarded to ``clf_type``.
    n_components : int
        Number of components used to initialize W and H.
    init : str
        Initialization scheme handed to ``_initialize_nmf``.
    random_state : int
        Seed handed to ``_initialize_nmf``.

    Returns
    -------
    (loss, duration) : tuple
        ``_safe_compute_error(X, W, H)`` for the fitted factors, and the
        wall-clock time in seconds spent in the fit call only.
    """
    W, H = _initialize_nmf(X, n_components, init, 1e-6, random_state)

    if name.startswith('nimfa'):
        clf = clf_type(X, seed=None, W=W, H=H, **clf_params)
        st = time()
        clf_fit = clf()
        end = time()
        # Convert each factor on its own: previously H was only densified
        # when W happened to be sparse.
        W = _as_dense_array(clf_fit.basis())
        H = _as_dense_array(clf_fit.coef())
    else:
        clf = clf_type(**clf_params)
        st = time()
        W = clf.fit_transform(X, W=W, H=H)
        end = time()
        H = clf.components_

    this_loss = _safe_compute_error(X, W, H)
    return this_loss, end - st
def run_bench(X, clfs, n_components, tol):
    """Benchmark every solver for every initialization and iteration budget.

    Parameters
    ----------
    X : array-like or sparse matrix
        Data to factorize; only ``X`` and ``X.shape`` are passed on to
        ``bench_one``.
    clfs : iterable of (name, clf_type, iter_range, clf_params)
        Solvers to run, as produced by ``build_clfs``. ``clf_params`` is
        copied before being completed, so the caller's dict is left intact.
    n_components : int
        Number of components of the factorization.
    tol : float
        Tolerance passed to the non-nimfa solvers.

    Returns
    -------
    pandas.DataFrame
        One row per fit, with the columns 'method', 'loss', 'time', 'init'.
    """
    # The position of each entry doubles as the random seed, so the three
    # 'random' runs use seeds 0, 1 and 2.
    inits = ('random', 'random', 'random', 'nndsvd', 'nndsvda', 'nndsvdar')

    results = []
    for name, clf_type, iter_range, clf_params in clfs:
        print("______________training %s ______________" % name)
        is_nimfa = name.startswith('nimfa')
        for rs, init in enumerate(inits):
            print(init)
            init_name = init + " " + str(rs)
            for itr in iter_range:
                # Work on a copy: mutating the caller's dict would leak
                # benchmark settings into whoever reuses it afterwards.
                params = dict(clf_params)
                params['max_iter'] = itr
                if is_nimfa:
                    params['rank'] = n_components
                else:
                    params['n_components'] = n_components
                    params['tol'] = tol
                    params['random_state'] = rs
                    # W and H are supplied explicitly by bench_one.
                    params['init'] = 'custom'
                this_loss, duration = bench_one(name, X, X.shape, clf_type,
                                                params,
                                                n_components, init, rs)
                results.append((name, this_loss, duration, init_name))
                print("loss: %.8f, time: %.6f sec" % (this_loss, duration))

    results_df = pandas.DataFrame(results,
                                  columns="method loss time init".split())
    print(" ")
    return results_df
def load_20news():
    """Fetch the 20 newsgroups posts and return their TF-IDF matrix.

    Headers, footers and quotes are removed from the posts; the vectorizer
    drops English stop words, terms present in fewer than 2 documents and
    terms present in more than 95% of the documents.
    """
    print("Loading 20 newsgroups dataset...")
    from sklearn.datasets import fetch_20newsgroups

    newsgroups = fetch_20newsgroups(
        remove=('headers', 'footers', 'quotes'),
        shuffle=True,
        random_state=1,
    )
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2,
                                       max_df=0.95)
    return tfidf_vectorizer.fit_transform(newsgroups.data)
def load_faces():
    """Fetch the (shuffled) Olivetti faces and return their data matrix."""
    print("Loading Olivetti face dataset...")
    from sklearn.datasets import fetch_olivetti_faces

    return fetch_olivetti_faces(shuffle=True).data
def load_data_rcv1():
    """Fetch the RCV1 data set and return its data matrix."""
    print("Loading RCV1 dataset...")
    from sklearn.datasets import fetch_rcv1

    return fetch_rcv1().data
def load_data_mnist():
    """Fetch the 'MNIST original' data set and return its data matrix."""
    print("Loading MNIST dataset...")
    # NOTE(review): fetch_mldata was removed from later scikit-learn
    # releases; confirm the installed version still provides it.
    from sklearn.datasets import fetch_mldata

    return fetch_mldata('MNIST original').data
################################################################# | |
def build_clfs(alpha, l1_ratio, iters, pg_iters, nimfa_iters):
    """Return the list of solvers to benchmark.

    Each entry is a ``(name, clf_type, iter_range, clf_params)`` tuple, the
    layout ``run_bench`` unpacks. ``alpha`` and ``l1_ratio`` are given to the
    three scikit-learn solvers only; ``iters``, ``pg_iters`` and
    ``nimfa_iters`` are the ``max_iter`` values tried for the greedy and
    coordinate solvers, the proj-grad solver and the nimfa solver.
    """
    def sklearn_params(solver, **extra):
        # Settings shared by every scikit-learn NMF solver, plus extras.
        params = {'solver': solver, 'alpha': alpha, 'l1_ratio': l1_ratio}
        params.update(extra)
        return params

    return [
        ("Greedy", NMF, iters, sklearn_params('greedy')),
        ("CD", NMF, iters, sklearn_params('coordinate')),
        ("proj-grad", NMF, pg_iters,
         sklearn_params('proj-grad', nls_max_iter=10)),
        ("nimfa proj-grad", nimfa.Lsnmf, nimfa_iters, {'test_conv': 1000}),
    ]
if __name__ == "__main__":
    # plot_results() ignores None, so each result starts out as None.
    results_20news = results_faces = results_rcv1 = results_mnist = None

    # regularization
    alpha = 0.
    l1_ratio = 0.1
    k = 10

    def max_iter_grid(stop, step):
        """Return the max_iter values 1, 1 + step, ... that are below stop."""
        return np.asarray(range(1, stop, step))

    def benchmark(X, n_components, iters, pg_iters, nimfa_iters):
        """Build the solver list and benchmark it on X with tol=1e-12."""
        clfs = build_clfs(alpha, l1_ratio, iters, pg_iters, nimfa_iters)
        return run_bench(X, clfs, n_components=n_components, tol=1e-12)

    # 20_News
    X_20news = load_20news()
    results_20news = benchmark(X_20news, k,
                               max_iter_grid(200, 16),
                               max_iter_grid(100, 8),
                               max_iter_grid(50, 8))

    # Faces
    X_faces = load_faces()
    results_faces = benchmark(X_faces, k,
                              max_iter_grid(100, 8),
                              max_iter_grid(100, 8),
                              max_iter_grid(50, 8))

    # RCV1: the nimfa solver reuses the proj-grad iteration grid.
    pg_iters = max_iter_grid(10, 2)
    X_rcv1 = load_data_rcv1()
    results_rcv1 = benchmark(X_rcv1, 15,
                             max_iter_grid(22, 4), pg_iters, pg_iters)

    # MNIST: one iteration grid shared by all solvers.
    iters = max_iter_grid(40, 4)
    X_mnist = load_data_mnist()
    results_mnist = benchmark(X_mnist, k, iters, iters, iters)

    for results in (results_20news, results_faces,
                    results_rcv1, results_mnist):
        plot_results(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment