zermelozf/saga_with_prox.py

## saga_with_prox.py
"""
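================================
Classification of text documents
================================

Compare lightning's SAGClassifier and SAGAClassifier with scikit-learn's
LogisticRegression (``solver='sag'``) on the vectorized 20 newsgroups
dataset, printing the fit time and the test-set score of each classifier.
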
"""
from os.path import dirname, realpath
import sys
sys.path.append(dirname(realpath(__file__)) + "/../lightning/impl/tests")
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model.sag import get_auto_step_size

from lightning.impl.sag import SAGClassifier, SAGAClassifier
from test_sag import PySAGAClassifier


# Load the vectorized News20 (20 newsgroups) dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Disabled experiments, kept for reference: relabel the targets to -1 / +1
# around their mean, keep only the first ``n_samples`` rows, or densify X.
# y[y < y.mean()] = -1
# y[y >= y.mean()] = 1
# n_samples = 100
# X = X[:n_samples]
# y = y[:n_samples]

# X = X.toarray()

# Parenthesized single-argument prints produce the same output as a
# Python 2 print statement and are also valid Python 3 calls.
print(X.shape)
print(type(X))

# Train / test split: 75% / 25% with a fixed random_state.
X_tr, X_te, y_tr, y_te = train_test_split(X, y,
                                          train_size=0.75,
                                          test_size=0.25,
                                          random_state=0)

# Regularization strength, defined once so the step-size computation and
# the classifier constructors below always use the same value.
alpha = .1
n_train = X_tr.shape[0]

# Largest squared L2 norm among the training rows.  The element-wise
# product and the row sums are taken on the matrix itself, so the training
# data is never expanded with ``toarray()``.
max_squared_sum = X_tr.multiply(X_tr).sum(axis=1).max()
print(max_squared_sum)

# The step-size helper is given alpha scaled by the number of training rows;
# the resulting step is passed as ``eta`` to both lightning solvers.
alpha_scaled = alpha / n_train
step = get_auto_step_size(max_squared_sum, alpha_scaled, 'log', False)

# Solvers to compare.  LogisticRegression is parameterized through
# C = 1 / (n_train * alpha) rather than through alpha directly.
clfs = (SAGClassifier(loss="log", eta=step, alpha=alpha, max_iter=100, verbose=False, random_state=0),
        SAGAClassifier(loss="log", eta=step, alpha=alpha, beta=0.0, max_iter=100, verbose=False, random_state=0),
        LogisticRegression(C=1. / (n_train * alpha), solver='sag', max_iter=100, fit_intercept=False, random_state=0),
#         PySAGAClassifier(eta=1e-3, alpha=0.0, beta=1e-4, penalty=None, max_iter=10),
        )

# Fit every classifier on the (sparse) training split and report the
# wall-clock fit time and the score on the held-out split.
# Prints use the parenthesized single-argument form, which outputs the same
# text under Python 2 and is valid Python 3.
print("--- Sparse ---")
for clf in clfs:
    print(clf.__class__.__name__)
    t = time.time()
    clf.fit(X_tr, y_tr)
    # Only the fit() call is timed; scoring happens afterwards.
    print("time = {}".format(time.time() - t))
    print("score = {}".format(clf.score(X_te, y_te)))
#     print clf.coef_[0, :5]
#     print clf.predict(X_te)
#     clf.loss

# print "--- Dense ---"
# X_tr = X_tr.toarray()
# X_te = X_te.toarray()
# for clf in clfs:
#     print clf.__class__.__name__
#     t = time.time()
#     clf.fit(X_tr, y_tr)
#     print "time = {}".format(time.time() - t)
#     print "score = {}".format(clf.score(X_te, y_te))
# #     print clf.coef_
# #     print clf.predict(X_te)
#     clf.loss
	"""
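	================================
	Classification of text documents
	================================

	Compare lightning's SAGClassifier and SAGAClassifier with scikit-learn's
	LogisticRegression (``solver='sag'``) on the vectorized 20 newsgroups
	dataset, printing the fit time and the test-set score of each classifier.
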
	"""
	from os.path import dirname, realpath
	import sys
	sys.path.append(dirname(realpath(__file__)) + "/../lightning/impl/tests")
	import time

	import numpy as np

	from sklearn.datasets import fetch_20newsgroups_vectorized
	from sklearn.cross_validation import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.linear_model.sag import get_auto_step_size

	from lightning.impl.sag import SAGClassifier, SAGAClassifier
	from test_sag import PySAGAClassifier


	# Load the vectorized News20 (20 newsgroups) dataset from scikit-learn.
	bunch = fetch_20newsgroups_vectorized(subset="all")
	X = bunch.data
	y = bunch.target

	# Disabled experiments, kept for reference: relabel the targets to -1 / +1
	# around their mean, keep only the first ``n_samples`` rows, or densify X.
	# y[y < y.mean()] = -1
	# y[y >= y.mean()] = 1
	# n_samples = 100
	# X = X[:n_samples]
	# y = y[:n_samples]

	# X = X.toarray()

	# Parenthesized single-argument prints produce the same output as a
	# Python 2 print statement and are also valid Python 3 calls.
	print(X.shape)
	print(type(X))

	# Train / test split: 75% / 25% with a fixed random_state.
	X_tr, X_te, y_tr, y_te = train_test_split(X, y,
		train_size=0.75,
		test_size=0.25,
		random_state=0)

	# Regularization strength, defined once so the step-size computation and
	# the classifier constructors below always use the same value.
	alpha = .1
	n_train = X_tr.shape[0]

	# Largest squared L2 norm among the training rows.  The element-wise
	# product and the row sums are taken on the matrix itself, so the training
	# data is never expanded with ``toarray()``.
	max_squared_sum = X_tr.multiply(X_tr).sum(axis=1).max()
	print(max_squared_sum)

	# The step-size helper is given alpha scaled by the number of training
	# rows; the resulting step is passed as ``eta`` to both lightning solvers.
	alpha_scaled = alpha / n_train
	step = get_auto_step_size(max_squared_sum, alpha_scaled, 'log', False)

	# Solvers to compare.  LogisticRegression is parameterized through
	# C = 1 / (n_train * alpha) rather than through alpha directly.
	clfs = (SAGClassifier(loss="log", eta=step, alpha=alpha, max_iter=100, verbose=False, random_state=0),
		SAGAClassifier(loss="log", eta=step, alpha=alpha, beta=0.0, max_iter=100, verbose=False, random_state=0),
		LogisticRegression(C=1. / (n_train * alpha), solver='sag', max_iter=100, fit_intercept=False, random_state=0),
		# PySAGAClassifier(eta=1e-3, alpha=0.0, beta=1e-4, penalty=None, max_iter=10),
		)

	# Fit every classifier on the (sparse) training split and report the
	# wall-clock fit time and the score on the held-out split.
	# The loop body is indented one level under ``for`` so it parses, and
	# prints use the parenthesized form valid on both Python 2 and Python 3.
	print("--- Sparse ---")
	for clf in clfs:
		print(clf.__class__.__name__)
		t = time.time()
		clf.fit(X_tr, y_tr)
		# Only the fit() call is timed; scoring happens afterwards.
		print("time = {}".format(time.time() - t))
		print("score = {}".format(clf.score(X_te, y_te)))
	# print clf.coef_[0, :5]
	# print clf.predict(X_te)
	# clf.loss

	# print "--- Dense ---"
	# X_tr = X_tr.toarray()
	# X_te = X_te.toarray()
	# for clf in clfs:
	# print clf.__class__.__name__
	# t = time.time()
	# clf.fit(X_tr, y_tr)
	# print "time = {}".format(time.time() - t)
	# print "score = {}".format(clf.score(X_te, y_te))
	# # print clf.coef_
	# # print clf.predict(X_te)
	# clf.loss