Skip to content

Instantly share code, notes, and snippets.

@zermelozf
Last active November 18, 2015 13:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zermelozf/d4670d6cffea09f6e6f3 to your computer and use it in GitHub Desktop.
Save zermelozf/d4670d6cffea09f6e6f3 to your computer and use it in GitHub Desktop.
"""
================================
Classification of text documents
================================
"""
from os.path import dirname, realpath
import sys
sys.path.append(dirname(realpath(__file__)) + "/../lightning/impl/tests")
import time
import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model.sag import get_auto_step_size
from lightning.impl.sag import SAGClassifier, SAGAClassifier
from test_sag import PySAGAClassifier
# Benchmark script: fit lightning's SAGClassifier / SAGAClassifier and
# scikit-learn's LogisticRegression(solver='sag') on the vectorized
# 20 newsgroups data, printing the wall-clock fit time and the test-set
# score of each estimator.

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Optional experiment switches, kept disabled: binarize the labels around
# their mean, keep only the first n_samples rows, or densify X.
# y[y < y.mean()] = -1
# y[y >= y.mean()] = 1
# n_samples = 100
# X = X[:n_samples]
# y = y[:n_samples]
# X = X.toarray()

print(X.shape)
print(type(X))

# Train / test split.
X_tr, X_te, y_tr, y_te = train_test_split(X, y,
                                          train_size=0.75,
                                          test_size=0.25,
                                          random_state=0)

# Largest squared L2 norm over the training rows.  The element-wise product
# is taken on the sparse matrix itself so no dense copy of X_tr is ever
# materialized (the previous version called X_tr.toarray() twice).
max_squared_sum = np.max(X_tr.multiply(X_tr).sum(axis=1))
print(max_squared_sum)

# Regularization strength shared by every estimator below.
alpha = .1
n_train = X_tr.shape[0]

# Step size handed to the lightning estimators as `eta`; it is derived from
# the per-sample regularization alpha / n_train.
# NOTE(review): the positional 'log' / False arguments are presumably the
# loss name and fit_intercept -- confirm against the installed scikit-learn.
alpha_scaled = alpha / n_train
step = get_auto_step_size(max_squared_sum, alpha_scaled, 'log', False)

clfs = (
    SAGClassifier(loss="log", eta=step, alpha=alpha, max_iter=100,
                  verbose=False, random_state=0),
    SAGAClassifier(loss="log", eta=step, alpha=alpha, beta=0.0, max_iter=100,
                   verbose=False, random_state=0),
    LogisticRegression(C=1. / (n_train * alpha), solver='sag', max_iter=100,
                       fit_intercept=False, random_state=0),
    # PySAGAClassifier(eta=1e-3, alpha=0.0, beta=1e-4, penalty=None,
    #                  max_iter=10),
)

print("--- Sparse ---")
for clf in clfs:
    print(clf.__class__.__name__)
    # Time only the call to fit(); scoring is reported separately.
    t = time.time()
    clf.fit(X_tr, y_tr)
    print("time = {}".format(time.time() - t))
    print("score = {}".format(clf.score(X_te, y_te)))
    # print clf.coef_[0, :5]
    # print clf.predict(X_te)
    # clf.loss

# Disabled variant of the same benchmark on dense arrays.
# print "--- Dense ---"
# X_tr = X_tr.toarray()
# X_te = X_te.toarray()
# for clf in clfs:
#     print clf.__class__.__name__
#     t = time.time()
#     clf.fit(X_tr, y_tr)
#     print "time = {}".format(time.time() - t)
#     print "score = {}".format(clf.score(X_te, y_te))
#     # print clf.coef_
#     # print clf.predict(X_te)
#     clf.loss
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment