"""
(C) August 2013, Mathieu Blondel
# License: BSD 3 clause
Custom group support by Vlad Niculae (vlad@vene.ro)
This is a Numba-based reimplementation of the block coordinate descent solver
(without line search) described in the paper:
Block Coordinate Descent Algorithms for Large-scale Sparse Multiclass
Classification. Mathieu Blondel, Kazuhiro Seki, and Kuniaki Uehara.
Machine Learning, May 2013.
The reference Cython implementation is avaible from the "primal_cd" module in:
https://github.com/mblondel/lightning
The reference Cython implementation appears to be roughly 2x faster than this
implementation.
"""
from __future__ import print_function
from numba import jit
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils import check_random_state
@jit("void(f8[:], i4[:], i4[:], i4[:], f8, f8[:])")
def _inv_step_sizes(X_data, X_indptr, group_ids, group_indptr, scale, out):
"""Compute the block-wise inverse step sizes (Lipschitz constants)."""
for group_no in range(len(group_indptr) - 1):
group = group_ids[group_indptr[group_no]:group_indptr[group_no + 1]]
sqnorm = 0
for j in group:
for k in xrange(X_indptr[j], X_indptr[j + 1]):
sqnorm += X_data[k] * X_data[k]
out[group_no] = scale * sqnorm
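
# Note: _fit passes scale = 4 * C * (n_classes - 1) into this function,
# which (per the paper) upper-bounds the blockwise Lipschitz constant of
# the multiclass squared hinge loss, so 1 / out[group_no] is a safe step
# size for block group_no.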
@jit("void(f8[:], i4[:], i4[:], i4[:], f8[:,:], i4[:], f8, f8[:])")
def _grad(X_data, X_indices, X_indptr, y, A, group, C, out):
"""Compute the partial gradient for the j^th block
(vector of size n_classes x group_size)."""
n_classes = out.shape[0]
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
for k in xrange(X_indptr[j], X_indptr[j + 1]):
i = X_indices[k]
if y[i] == r:
continue
if A[r, i] > 0:
update = 2 * C * A[r, i] * X_data[k]
out[y[i], feat_no] -= update
out[r, feat_no] += update
@jit("void(f8[:], i4[:], f8, f8, f8[:], f8[:, :], f8)")
def _update_coef(grad, group, step_size, alpha, update, coef, tol):
"""Update the j^th block of the coefficient matrix."""
n_classes = grad.shape[0]
update_norm = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
update[r, feat_no] = coef[r, j] - step_size * grad[r, feat_no]
update_norm += update[r, feat_no] * update[r, feat_no]
update_norm = np.sqrt(update_norm)
if update_norm < tol:
scale = 0
else:
mu = alpha * step_size
scale = 1 - mu / update_norm
if scale < 0:
scale = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
old = coef[r, j]
coef[r, j] = scale * update[r, feat_no]
update[r, feat_no] = coef[r, j] - old
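
# For reference, a minimal vectorized NumPy sketch of the same block
# soft-thresholding that _update_coef performs above (the helper name
# `prox_group_l2` is illustrative and not used by the solver): take a
# gradient step on one block of coefficients, then shrink the whole block
# toward zero by its l2 norm.
def prox_group_l2(W_block, grad_block, step_size, alpha):
    V = W_block - step_size * grad_block    # gradient step on the block
    norm = np.sqrt(np.sum(V ** 2))          # l2 norm of the whole block
    if norm < 1e-12:                        # block is (numerically) zero
        return np.zeros_like(V)
    return max(0.0, 1.0 - alpha * step_size / norm) * V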
@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], f8[:], f8[:, :])")
def _update_A(X_data, X_indices, X_indptr, y, group, update, A):
"""Update matrix A (see paper)."""
n_classes = A.shape[0]
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
for k in xrange(X_indptr[j], X_indptr[j + 1]):
i = X_indices[k]
if y[i] == r:
continue
A[r, i] += (update[r, feat_no] - update[y[i], feat_no]) * X_data[k]
@jit("f8(f8[:], f8[:], i4[:], f8)")
def _violation(grad, coef, group, alpha):
"""Compute optimality violation for the j^th block."""
n_classes = grad.shape[0]
coef_norm = 0
grad_norm = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
coef_norm += coef[r, j] * coef[r, j]
grad_norm += grad[r, feat_no] * grad[r, feat_no]
grad_norm = np.sqrt(grad_norm)
if coef_norm == 0:
violation = max(grad_norm - alpha, 0)
else:
violation = np.abs(grad_norm - alpha)
return violation
@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], i4[:], i4, f8, f8, f8, i4, f8[:,:])")
def _fit(X_data, X_indices, X_indptr, y, group_ids, group_indptr, max_iter,
alpha, C, tol, verbose, coef):
n_samples = y.shape[0]
n_groups = group_indptr.shape[0] - 1
n_classes, n_features = coef.shape
inv_step_sizes = np.zeros(n_groups, dtype=np.float64)
_inv_step_sizes(X_data, X_indptr, group_ids, group_indptr,
C * 4 * (n_classes - 1),
inv_step_sizes)
grad = np.empty((n_classes, 10), dtype=np.float64)
update = np.zeros((n_classes, 10), dtype=np.float64)
A = np.ones((n_classes, n_samples), dtype=np.float64)
rs = np.random.RandomState(None)
violation_init = 0
for it in xrange(max_iter):
violation_max = 0
for _ in xrange(n_groups):
j = rs.randint(n_groups - 1)
group = group_ids[group_indptr[j]:group_indptr[j + 1]]
if inv_step_sizes[j] == 0:
continue
grad.fill(0)
_grad(X_data, X_indices, X_indptr, y, A, group, C, grad)
violation = _violation(grad, coef, group, alpha)
_update_coef(grad, group, 1. / inv_step_sizes[j], alpha,
update, coef, tol)
_update_A(X_data, X_indices, X_indptr, y, group, update, A)
if violation > violation_max:
violation_max = violation
if it == 0:
violation_init = violation_max
if violation_max == violation_init == 0:
ratio = 0.0
else:
ratio = violation_max / violation_init
if verbose >= 1:
print("Violation {:.2f}".format(ratio))
if ratio < tol:
if verbose >= 1:
print("Converged at iter", it + 1)
break
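
# For sanity checking, a hedged NumPy sketch of the objective value that
# _fit decreases (the helper name `_objective` is illustrative): the
# multiclass squared hinge loss of Blondel et al. plus an l1/l2 penalty
# over groups of columns of `coef`.
def _objective(X, y, coef, groups, alpha, C):
    scores = safe_sparse_dot(X, coef.T)            # (n_samples, n_classes)
    margin = scores[np.arange(len(y)), y][:, None] - scores
    hinge = np.maximum(0.0, 1.0 - margin)
    hinge[np.arange(len(y)), y] = 0.0              # skip the r == y_i terms
    loss = C * np.sum(hinge ** 2)
    penalty = alpha * sum(np.sqrt(np.sum(coef[:, g] ** 2)) for g in groups)
    return loss + penalty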
def make_group_classification(n_samples=200, group_size=10,
                              n_classes=2, random_state=None):
    """Make a classification dataset for sparse group lasso.
    Implements Sec. 4 from "A note on the group lasso and a sparse group
    lasso", by Friedman et al.
    """
    rng = check_random_state(random_state)
    n_nonzero = [10, 8, 6, 4, 2, 1, 0, 0, 0, 0]
    n_features = len(n_nonzero) * group_size
    X = rng.randn(n_samples, n_features)
    coef = np.zeros((n_features, n_classes))
    group_starts = range(0, n_features, group_size)
    for start, this_n_nonzero in zip(group_starts, n_nonzero):
        for klass in range(n_classes):
            idx = np.arange(group_size)
            rng.shuffle(idx)
            coef[start:start + group_size, klass][idx[:this_n_nonzero]] = \
                rng.choice([-1, +1], size=this_n_nonzero)
    y = np.argmax(np.dot(X, coef), axis=1)
    return X, y, group_starts
class SparseMulticlassClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1, C=1, max_iter=20, tol=0.05, verbose=0):
        self.alpha = alpha
        self.C = C
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def fit(self, X, y, groups=None):
        X = sp.csc_matrix(X)
        n_samples, n_features = X.shape
        self._enc = LabelEncoder()
        y = self._enc.fit_transform(y).astype(np.int32)
        n_classes = len(self._enc.classes_)
        self.coef_ = np.zeros((n_classes, n_features), dtype=np.float64)
        if groups is None:
            # Default: each feature is its own group (plain l1/l2 penalty).
            groups = [[j] for j in range(n_features)]
        group_ids = np.hstack(groups).astype(np.int32)
        group_offset = np.cumsum([0] + [len(k) for k in groups]).astype(np.int32)
        _fit(X.data, X.indices, X.indptr, y, group_ids, group_offset,
             self.max_iter, self.alpha, self.C, self.tol, self.verbose,
             self.coef_)
        return self

    def decision_function(self, X):
        return safe_sparse_dot(X, self.coef_.T)

    def predict(self, X):
        pred = self.decision_function(X)
        pred = np.argmax(pred, axis=1)
        return self._enc.inverse_transform(pred)

    def n_nonzero(self, percentage=False):
        # Count features whose column has any nonzero coefficient.
        n_nz = np.sum(np.sum(self.coef_ != 0, axis=0, dtype=bool))
        if percentage:
            n_nz /= float(self.coef_.shape[1])
        return n_nz
if __name__ == '__main__':
    import time

    X, y, group_starts = make_group_classification(group_size=10, n_classes=3)
    groups = [range(k, k + 10) for k in group_starts]
    X = sp.csr_matrix(X)
    print(X.shape, y.shape)

    s = time.time()
    clf_ = SparseMulticlassClassifier(C=0.8 / X.shape[0], alpha=1, tol=1e-3,
                                      max_iter=50, verbose=1)
    clf_.fit(X, y, groups)
    training_time = time.time() - s
    print("Numba")
    print(training_time)
    print(clf_.score(X, y))
    print(clf_.n_nonzero(percentage=True))
    print()

    from lightning.classification import CDClassifier
    clf = CDClassifier(C=0.1 / X.shape[0], alpha=0.1, tol=1e-3, max_iter=100,
                       multiclass=True, penalty="l1/l2", shrinking=False,
                       max_steps=0, selection="uniform", verbose=0)
    s = time.time()
    clf.fit(X, y)
    training_time = time.time() - s
    print("Cython")
    print(training_time)
    print(clf.score(X, y))
    print(clf.n_nonzero(percentage=True))
    print()