"""
(C) August 2013, Mathieu Blondel
# License: BSD 3 clause
Custom group support by Vlad Niculae (vlad@vene.ro)
This is a Numba-based reimplementation of the block coordinate descent solver
(without line search) described in the paper:
Block Coordinate Descent Algorithms for Large-scale Sparse Multiclass
Classification. Mathieu Blondel, Kazuhiro Seki, and Kuniaki Uehara.
Machine Learning, May 2013.
The reference Cython implementation is avaible from the "primal_cd" module in:
https://github.com/mblondel/lightning
The reference Cython implementation appears to be roughly 2x faster than this
implementation.
"""
from __future__ import print_function
from numba import jit
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils import check_random_state
@jit("void(f8[:], i4[:], i4[:], i4[:], f8, f8[:])")
def _inv_step_sizes(X_data, X_indptr, group_ids, group_indptr, scale, out):
"""Compute the block-wise inverse step sizes (Lipschitz constants)."""
for group_no in range(len(group_indptr) - 1):
group = group_ids[group_indptr[group_no]:group_indptr[group_no + 1]]
sqnorm = 0
for j in group:
for k in xrange(X_indptr[j], X_indptr[j + 1]):
sqnorm += X_data[k] * X_data[k]
out[group_no] = scale * sqnorm
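
# Note: _fit passes scale = 4 * C * (n_classes - 1) into this function,
# which (per the paper) upper-bounds the blockwise Lipschitz constant of
# the multiclass squared hinge loss, so 1 / out[group_no] is a safe step
# size for block group_no.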
@jit("void(f8[:], i4[:], i4[:], i4[:], f8[:,:], i4[:], f8, f8[:])")
def _grad(X_data, X_indices, X_indptr, y, A, group, C, out):
"""Compute the partial gradient for the j^th block
(vector of size n_classes x group_size)."""
n_classes = out.shape[0]
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
for k in xrange(X_indptr[j], X_indptr[j + 1]):
i = X_indices[k]
if y[i] == r:
continue
if A[r, i] > 0:
update = 2 * C * A[r, i] * X_data[k]
out[y[i], feat_no] -= update
out[r, feat_no] += update
@jit("void(f8[:], i4[:], f8, f8, f8[:], f8[:, :], f8)")
def _update_coef(grad, group, step_size, alpha, update, coef, tol):
"""Update the j^th block of the coefficient matrix."""
n_classes = grad.shape[0]
update_norm = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
update[r, feat_no] = coef[r, j] - step_size * grad[r, feat_no]
update_norm += update[r, feat_no] * update[r, feat_no]
update_norm = np.sqrt(update_norm)
if update_norm < tol:
scale = 0
else:
mu = alpha * step_size
scale = 1 - mu / update_norm
if scale < 0:
scale = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
old = coef[r, j]
coef[r, j] = scale * update[r, feat_no]
update[r, feat_no] = coef[r, j] - old
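
# For reference, a minimal vectorized NumPy sketch of the same block
# soft-thresholding that _update_coef performs above (the helper name
# `prox_group_l2` is illustrative and not used by the solver): take a
# gradient step on one block of coefficients, then shrink the whole block
# toward zero by its l2 norm.
def prox_group_l2(W_block, grad_block, step_size, alpha):
    V = W_block - step_size * grad_block    # gradient step on the block
    norm = np.sqrt(np.sum(V ** 2))          # l2 norm of the whole block
    if norm < 1e-12:                        # block is (numerically) zero
        return np.zeros_like(V)
    return max(0.0, 1.0 - alpha * step_size / norm) * V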
@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], f8[:], f8[:, :])")
def _update_A(X_data, X_indices, X_indptr, y, group, update, A):
"""Update matrix A (see paper)."""
n_classes = A.shape[0]
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
for k in xrange(X_indptr[j], X_indptr[j + 1]):
i = X_indices[k]
if y[i] == r:
continue
A[r, i] += (update[r, feat_no] - update[y[i], feat_no]) * X_data[k]
@jit("f8(f8[:], f8[:], i4[:], f8)")
def _violation(grad, coef, group, alpha):
"""Compute optimality violation for the j^th block."""
n_classes = grad.shape[0]
coef_norm = 0
grad_norm = 0
for feat_no, j in enumerate(group):
for r in xrange(n_classes):
coef_norm += coef[r, j] * coef[r, j]
grad_norm += grad[r, feat_no] * grad[r, feat_no]
grad_norm = np.sqrt(grad_norm)
if coef_norm == 0:
violation = max(grad_norm - alpha, 0)
else:
violation = np.abs(grad_norm - alpha)
return violation
@jit("void(f8[:], i4[:], i4[:], i4[:], i4[:], i4[:], i4, f8, f8, f8, i4, f8[:,:])")
def _fit(X_data, X_indices, X_indptr, y, group_ids, group_indptr, max_iter,
alpha, C, tol, verbose, coef):
n_samples = y.shape[0]
n_groups = group_indptr.shape[0] - 1
n_classes, n_features = coef.shape
inv_step_sizes = np.zeros(n_groups, dtype=np.float64)
_inv_step_sizes(X_data, X_indptr, group_ids, group_indptr,
C * 4 * (n_classes - 1),
inv_step_sizes)
grad = np.empty((n_classes, 10), dtype=np.float64)
update = np.zeros((n_classes, 10), dtype=np.float64)
A = np.ones((n_classes, n_samples), dtype=np.float64)
rs = np.random.RandomState(None)
violation_init = 0
for it in xrange(max_iter):
violation_max = 0
for _ in xrange(n_groups):
j = rs.randint(n_groups - 1)
group = group_ids[group_indptr[j]:group_indptr[j + 1]]
if inv_step_sizes[j] == 0:
continue
grad.fill(0)
_grad(X_data, X_indices, X_indptr, y, A, group, C, grad)
violation = _violation(grad, coef, group, alpha)
_update_coef(grad, group, 1. / inv_step_sizes[j], alpha,
update, coef, tol)
_update_A(X_data, X_indices, X_indptr, y, group, update, A)
if violation > violation_max:
violation_max = violation
if it == 0:
violation_init = violation_max
if violation_max == violation_init == 0:
ratio = 0.0
else:
ratio = violation_max / violation_init
if verbose >= 1:
print("Violation {:.2f}".format(ratio))
if ratio < tol:
if verbose >= 1:
print("Converged at iter", it + 1)
break
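
# For sanity checking, a hedged NumPy sketch of the objective value that
# _fit decreases (the helper name `_objective` is illustrative): the
# multiclass squared hinge loss of Blondel et al. plus an l1/l2 penalty
# over groups of columns of `coef`.
def _objective(X, y, coef, groups, alpha, C):
    scores = safe_sparse_dot(X, coef.T)            # (n_samples, n_classes)
    margin = scores[np.arange(len(y)), y][:, None] - scores
    hinge = np.maximum(0.0, 1.0 - margin)
    hinge[np.arange(len(y)), y] = 0.0              # skip the r == y_i terms
    loss = C * np.sum(hinge ** 2)
    penalty = alpha * sum(np.sqrt(np.sum(coef[:, g] ** 2)) for g in groups)
    return loss + penalty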
def make_group_classification(n_samples=200, group_size=10,
                              n_classes=2, random_state=None):
    """Make a classification dataset for sparse group lasso.
    Implements Sec. 4 from "A note on the group lasso and a sparse group
    lasso", by Friedman et al.
    """
    rng = check_random_state(random_state)
    n_nonzero = [10, 8, 6, 4, 2, 1, 0, 0, 0, 0]
    n_features = len(n_nonzero) * group_size
    X = rng.randn(n_samples, n_features)
    coef = np.zeros((n_features, n_classes))
    group_starts = range(0, n_features, group_size)
    for start, this_n_nonzero in zip(group_starts, n_nonzero):
        for klass in range(n_classes):
            idx = np.arange(group_size)
            rng.shuffle(idx)
            coef[start:start + group_size, klass][idx[:this_n_nonzero]] = \
                rng.choice([-1, +1], size=this_n_nonzero)
    y = np.argmax(np.dot(X, coef), axis=1)
    return X, y, group_starts
class SparseMulticlassClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1, C=1, max_iter=20, tol=0.05, verbose=0):
        self.alpha = alpha
        self.C = C
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def fit(self, X, y, groups=None):
        X = sp.csc_matrix(X)
        n_samples, n_features = X.shape
        self._enc = LabelEncoder()
        y = self._enc.fit_transform(y).astype(np.int32)
        n_classes = len(self._enc.classes_)
        self.coef_ = np.zeros((n_classes, n_features), dtype=np.float64)
        if groups is None:
            # Default: each feature is its own group (plain l1/l2 penalty).
            groups = [[j] for j in range(n_features)]
        group_ids = np.hstack(groups).astype(np.int32)
        group_offset = np.cumsum([0] + [len(k) for k in groups]).astype(np.int32)
        _fit(X.data, X.indices, X.indptr, y, group_ids, group_offset,
             self.max_iter, self.alpha, self.C, self.tol, self.verbose,
             self.coef_)
        return self

    def decision_function(self, X):
        return safe_sparse_dot(X, self.coef_.T)

    def predict(self, X):
        pred = self.decision_function(X)
        pred = np.argmax(pred, axis=1)
        return self._enc.inverse_transform(pred)

    def n_nonzero(self, percentage=False):
        # Count features whose column has any nonzero coefficient.
        n_nz = np.sum(np.sum(self.coef_ != 0, axis=0, dtype=bool))
        if percentage:
            n_nz /= float(self.coef_.shape[1])
        return n_nz
if __name__ == '__main__':
    import time

    X, y, group_starts = make_group_classification(group_size=10, n_classes=3)
    groups = [range(k, k + 10) for k in group_starts]
    X = sp.csr_matrix(X)
    print(X.shape, y.shape)

    s = time.time()
    clf_ = SparseMulticlassClassifier(C=0.8 / X.shape[0], alpha=1, tol=1e-3,
                                      max_iter=50, verbose=1)
    clf_.fit(X, y, groups)
    training_time = time.time() - s
    print("Numba")
    print(training_time)
    print(clf_.score(X, y))
    print(clf_.n_nonzero(percentage=True))
    print()

    from lightning.classification import CDClassifier
    clf = CDClassifier(C=0.1 / X.shape[0], alpha=0.1, tol=1e-3, max_iter=100,
                       multiclass=True, penalty="l1/l2", shrinking=False,
                       max_steps=0, selection="uniform", verbose=0)
    s = time.time()
    clf.fit(X, y)
    training_time = time.time() - s
    print("Cython")
    print(training_time)
    print(clf.score(X, y))
    print(clf.n_nonzero(percentage=True))
    print()