vene/bench_eucl.py

## bench_eucl.py
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import issparse

from sklearn.utils import atleast2d_or_csr
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.metrics.pairwise import check_pairwise_arrays, euclidean_distances
from sklearn.metrics.euclidean_fast import dense_euclidean_distances, sparse_euclidean_distances


from numpy.testing import assert_array_almost_equal


def euclidean_distances_slow(X, Y=None, Y_norm_squared=None, squared=False):
    """
    Considering the rows of X (and Y=X) as vectors, compute the
    distance matrix between each pair of vectors.

    For efficiency reasons, the euclidean distance between a pair of row
    vector x and y is computed as::

        dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

    This formulation has two main advantages. First, it is computationally
    efficient when dealing with sparse data. Second, if x varies but y
    remains unchanged, then the right-most dot-product `dot(y, y)` can be
    pre-computed.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples_1, n_features]

    Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features]

    Y_norm_squared : array-like, shape = [n_samples_2], optional
        Pre-computed dot-products of vectors in Y (e.g.,
        ``(Y**2).sum(axis=1)``)

    squared : boolean, optional
        Return squared Euclidean distances.

    Returns
    -------
    distances : {array, sparse matrix}, shape = [n_samples_1, n_samples_2]

    Examples
    --------
    >>> from sklearn.metrics.pairwise import euclidean_distances
    >>> X = [[0, 1], [1, 1]]
    >>> # distance between rows of X
    >>> euclidean_distances(X, X)
    array([[ 0.,  1.],
           [ 1.,  0.]])
    >>> # get distance to origin
    >>> euclidean_distances(X, [[0, 0]])
    array([[ 1.        ],
           [ 1.41421356]])
    """
    # should not need X_norm_squared because if you could precompute that as
    # well as Y, then you should just pre-compute the output and not even
    # call this function.
    X, Y = check_pairwise_arrays(X, Y)
    if issparse(X):
        XX = X.multiply(X).sum(axis=1)
    else:
        XX = np.sum(X * X, axis=1)[:, np.newaxis]

    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
        YY = XX.T
    elif Y_norm_squared is None:
        if issparse(Y):
            # scipy.sparse matrices don't have element-wise scalar
            # exponentiation, and tocsr has a copy kwarg only on CSR matrices.
            YY = Y.copy() if isinstance(Y, csr_matrix) else Y.tocsr()
            YY.data **= 2
            YY = np.asarray(YY.sum(axis=1)).T
        else:
            YY = np.sum(Y ** 2, axis=1)[np.newaxis, :]
    else:
        YY = atleast2d_or_csr(Y_norm_squared)
        if YY.shape != (1, Y.shape[0]):
            raise ValueError(
                        "Incompatible dimensions for Y and Y_norm_squared")

    # TODO: a faster Cython implementation would do the clipping of negative
    # values in a single pass over the output matrix.
    distances = safe_sparse_dot(X, Y.T, dense_output=True)
    distances *= -2
    distances += XX
    distances += YY
    np.maximum(distances, 0, distances)

    if X is Y:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        distances.flat[::distances.shape[0] + 1] = 0.0

    return distances if squared else np.sqrt(distances)


def test_euclidean_distances():
    """ Check the pairwise Euclidean distances computation"""
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])


def main():
    test_euclidean_distances()
    from time import time
    # random sparse

    print "sparse 1000x1000:"
    print '(X, Y): '
    X = np.zeros((1000, 1000))
    X.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
    Y = np.zeros((1000, 1000))
    Y.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
    X = csr_matrix(X)
    Y = csr_matrix(Y)
    import gc
    gc.collect()
    t0 = time()
    A = euclidean_distances_slow(X, Y)
    print 'slow: ', time() - t0
    t0 = time()
    #XYt = safe_sparse_dot(X, Y.T, dense_output=True)
    #B = sparse_euclidean_distances(X.shape[0], Y.shape[0],
    #                                  X.data, X.indices, X.indptr,
    #                                  Y.data, Y.indices, Y.indptr,
    #                                  XYt)
    B = euclidean_distances(X, Y)
    print 'fast: ', time() - t0
    print np.sum((A - B) ** 2)
    del Y, A, B
    print '(X, X): '
    t0 = time()
    A = euclidean_distances_slow(X)
    print 'slow: ', time() - t0
    t0 = time()
    B = euclidean_distances(X)
    print 'fast: ', time() - t0
    print 'error: ', np.sum((A - B) ** 2)
    del A, B, X
    for k in (100, 1000, 10000):
        print "dense 1000x%d:" % k
        print '(X, Y): '
        gc.collect()
        X, Y = np.random.randn(1000, k), np.random.randn(1000, k)
        t0 = time()
        A = euclidean_distances_slow(X, Y, squared=False)
        print 'slow: ', time() - t0
        t0 = time()
        B = euclidean_distances(X, Y, squared=False)
        print 'fast: ', time() - t0
        # bare metal call
        #C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
        #dense_euclidean_distances(X, Y, C, squared=False)
        #print time() - t0
        print 'error: ', np.sum((A - B) ** 2)
        del A, B, Y
        print "(X, X):"
        gc.collect()
        t0 = time()
        A = euclidean_distances_slow(X, squared=False)
        print 'slow: ', time() - t0
        t0 = time()
        B = euclidean_distances(X, squared=False)
        print 'fast: ', time() - t0
        #C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
        #dense_euclidean_distances(X, Y, C, squared=False)
        #print time() - t0
        print 'error: ', np.sum((A - B) ** 2)


if __name__ == '__main__':
    main()

## output.txt
sparse 1000x1000:
(X, Y):
slow:  0.0871849060059
fast:  0.0804500579834
6.69689265991e-25
(X, X):
slow:  0.0874190330505
fast:  0.0775861740112
error:  6.74330923557e-25
dense 1000x100:
(X, Y):
slow:  0.0320308208466
fast:  0.0189578533173
error:  6.51681149247e-25
(X, X):
slow:  0.0325620174408
fast:  0.0181469917297
error:  3.12638803735e-13
dense 1000x1000:
(X, Y):
slow:  0.140306949615
fast:  0.105072021484
error:  7.70655257706e-24
(X, X):
slow:  0.11566400528
fast:  0.0698621273041
error:  6.57564669382e-10
dense 1000x10000:
(X, Y):
slow:  1.12574601173
fast:  0.911577939987
error:  1.36188340832e-22
(X, X):
slow:  1.04345297813
fast:  0.521425008774
error:  2.18351488002e-08
	import numpy as np
	from scipy.sparse import csr_matrix
	from scipy.sparse import issparse

	from sklearn.utils import atleast2d_or_csr
	from sklearn.utils.extmath import safe_sparse_dot
	from sklearn.metrics.pairwise import check_pairwise_arrays, euclidean_distances
	from sklearn.metrics.euclidean_fast import dense_euclidean_distances, sparse_euclidean_distances


	from numpy.testing import assert_array_almost_equal


	def euclidean_distances_slow(X, Y=None, Y_norm_squared=None, squared=False):
	"""
	Considering the rows of X (and Y=X) as vectors, compute the
	distance matrix between each pair of vectors.

	For efficiency reasons, the euclidean distance between a pair of row
	vector x and y is computed as::

	dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

	This formulation has two main advantages. First, it is computationally
	efficient when dealing with sparse data. Second, if x varies but y
	remains unchanged, then the right-most dot-product `dot(y, y)` can be
	pre-computed.

	Parameters
	----------
	X : {array-like, sparse matrix}, shape = [n_samples_1, n_features]

	Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features]

	Y_norm_squared : array-like, shape = [n_samples_2], optional
	Pre-computed dot-products of vectors in Y (e.g.,
	``(Y**2).sum(axis=1)``)

	squared : boolean, optional
	Return squared Euclidean distances.

	Returns
	-------
	distances : {array, sparse matrix}, shape = [n_samples_1, n_samples_2]

	Examples
	--------
	>>> from sklearn.metrics.pairwise import euclidean_distances
	>>> X = [[0, 1], [1, 1]]
	>>> # distance between rows of X
	>>> euclidean_distances(X, X)
	array([[ 0., 1.],
	[ 1., 0.]])
	>>> # get distance to origin
	>>> euclidean_distances(X, [[0, 0]])
	array([[ 1. ],
	[ 1.41421356]])
	"""
	# should not need X_norm_squared because if you could precompute that as
	# well as Y, then you should just pre-compute the output and not even
	# call this function.
	X, Y = check_pairwise_arrays(X, Y)
	if issparse(X):
	XX = X.multiply(X).sum(axis=1)
	else:
	XX = np.sum(X * X, axis=1)[:, np.newaxis]

	if X is Y: # shortcut in the common case euclidean_distances(X, X)
	YY = XX.T
	elif Y_norm_squared is None:
	if issparse(Y):
	# scipy.sparse matrices don't have element-wise scalar
	# exponentiation, and tocsr has a copy kwarg only on CSR matrices.
	YY = Y.copy() if isinstance(Y, csr_matrix) else Y.tocsr()
	YY.data **= 2
	YY = np.asarray(YY.sum(axis=1)).T
	else:
	YY = np.sum(Y ** 2, axis=1)[np.newaxis, :]
	else:
	YY = atleast2d_or_csr(Y_norm_squared)
	if YY.shape != (1, Y.shape[0]):
	raise ValueError(
	"Incompatible dimensions for Y and Y_norm_squared")

	# TODO: a faster Cython implementation would do the clipping of negative
	# values in a single pass over the output matrix.
	distances = safe_sparse_dot(X, Y.T, dense_output=True)
	distances *= -2
	distances += XX
	distances += YY
	np.maximum(distances, 0, distances)

	if X is Y:
	# Ensure that distances between vectors and themselves are set to 0.0.
	# This may not be the case due to floating point rounding errors.
	distances.flat[::distances.shape[0] + 1] = 0.0

	return distances if squared else np.sqrt(distances)


	def test_euclidean_distances():
	""" Check the pairwise Euclidean distances computation"""
	X = [[0]]
	Y = [[1], [2]]
	D = euclidean_distances(X, Y)
	assert_array_almost_equal(D, [[1., 2.]])

	X = csr_matrix(X)
	Y = csr_matrix(Y)
	D = euclidean_distances(X, Y)
	assert_array_almost_equal(D, [[1., 2.]])


	def main():
	test_euclidean_distances()
	from time import time
	# random sparse

	print "sparse 1000x1000:"
	print '(X, Y): '
	X = np.zeros((1000, 1000))
	X.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
	Y = np.zeros((1000, 1000))
	Y.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
	X = csr_matrix(X)
	Y = csr_matrix(Y)
	import gc
	gc.collect()
	t0 = time()
	A = euclidean_distances_slow(X, Y)
	print 'slow: ', time() - t0
	t0 = time()
	#XYt = safe_sparse_dot(X, Y.T, dense_output=True)
	#B = sparse_euclidean_distances(X.shape[0], Y.shape[0],
	# X.data, X.indices, X.indptr,
	# Y.data, Y.indices, Y.indptr,
	# XYt)
	B = euclidean_distances(X, Y)
	print 'fast: ', time() - t0
	print np.sum((A - B) ** 2)
	del Y, A, B
	print '(X, X): '
	t0 = time()
	A = euclidean_distances_slow(X)
	print 'slow: ', time() - t0
	t0 = time()
	B = euclidean_distances(X)
	print 'fast: ', time() - t0
	print 'error: ', np.sum((A - B) ** 2)
	del A, B, X
	for k in (100, 1000, 10000):
	print "dense 1000x%d:" % k
	print '(X, Y): '
	gc.collect()
	X, Y = np.random.randn(1000, k), np.random.randn(1000, k)
	t0 = time()
	A = euclidean_distances_slow(X, Y, squared=False)
	print 'slow: ', time() - t0
	t0 = time()
	B = euclidean_distances(X, Y, squared=False)
	print 'fast: ', time() - t0
	# bare metal call
	#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
	#dense_euclidean_distances(X, Y, C, squared=False)
	#print time() - t0
	print 'error: ', np.sum((A - B) ** 2)
	del A, B, Y
	print "(X, X):"
	gc.collect()
	t0 = time()
	A = euclidean_distances_slow(X, squared=False)
	print 'slow: ', time() - t0
	t0 = time()
	B = euclidean_distances(X, squared=False)
	print 'fast: ', time() - t0
	#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
	#dense_euclidean_distances(X, Y, C, squared=False)
	#print time() - t0
	print 'error: ', np.sum((A - B) ** 2)


	if __name__ == '__main__':
	main()
	sparse 1000x1000:
	(X, Y):
	slow: 0.0871849060059
	fast: 0.0804500579834
	6.69689265991e-25
	(X, X):
	slow: 0.0874190330505
	fast: 0.0775861740112
	error: 6.74330923557e-25
	dense 1000x100:
	(X, Y):
	slow: 0.0320308208466
	fast: 0.0189578533173
	error: 6.51681149247e-25
	(X, X):
	slow: 0.0325620174408
	fast: 0.0181469917297
	error: 3.12638803735e-13
	dense 1000x1000:
	(X, Y):
	slow: 0.140306949615
	fast: 0.105072021484
	error: 7.70655257706e-24
	(X, X):
	slow: 0.11566400528
	fast: 0.0698621273041
	error: 6.57564669382e-10
	dense 1000x10000:
	(X, Y):
	slow: 1.12574601173
	fast: 0.911577939987
	error: 1.36188340832e-22
	(X, X):
	slow: 1.04345297813
	fast: 0.521425008774
	error: 2.18351488002e-08