Skip to content

Instantly share code, notes, and snippets.

@vene
Created August 9, 2012 16:44
Show Gist options
  • Save vene/3305769 to your computer and use it in GitHub Desktop.
Save vene/3305769 to your computer and use it in GitHub Desktop.
Benchmark euclidean_distances
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.utils import atleast2d_or_csr
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.metrics.pairwise import check_pairwise_arrays, euclidean_distances
from sklearn.metrics.euclidean_fast import dense_euclidean_distances, sparse_euclidean_distances
from numpy.testing import assert_array_almost_equal
def euclidean_distances_slow(X, Y=None, Y_norm_squared=None, squared=False):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.
For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::
dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))
This formulation has two main advantages. First, it is computationally
efficient when dealing with sparse data. Second, if x varies but y
remains unchanged, then the right-most dot-product `dot(y, y)` can be
pre-computed.
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples_1, n_features]
Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features]
Y_norm_squared : array-like, shape = [n_samples_2], optional
Pre-computed dot-products of vectors in Y (e.g.,
``(Y**2).sum(axis=1)``)
squared : boolean, optional
Return squared Euclidean distances.
Returns
-------
distances : {array, sparse matrix}, shape = [n_samples_1, n_samples_2]
Examples
--------
>>> from sklearn.metrics.pairwise import euclidean_distances
>>> X = [[0, 1], [1, 1]]
>>> # distance between rows of X
>>> euclidean_distances(X, X)
array([[ 0., 1.],
[ 1., 0.]])
>>> # get distance to origin
>>> euclidean_distances(X, [[0, 0]])
array([[ 1. ],
[ 1.41421356]])
"""
# should not need X_norm_squared because if you could precompute that as
# well as Y, then you should just pre-compute the output and not even
# call this function.
X, Y = check_pairwise_arrays(X, Y)
if issparse(X):
XX = X.multiply(X).sum(axis=1)
else:
XX = np.sum(X * X, axis=1)[:, np.newaxis]
if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is None:
if issparse(Y):
# scipy.sparse matrices don't have element-wise scalar
# exponentiation, and tocsr has a copy kwarg only on CSR matrices.
YY = Y.copy() if isinstance(Y, csr_matrix) else Y.tocsr()
YY.data **= 2
YY = np.asarray(YY.sum(axis=1)).T
else:
YY = np.sum(Y ** 2, axis=1)[np.newaxis, :]
else:
YY = atleast2d_or_csr(Y_norm_squared)
if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
# TODO: a faster Cython implementation would do the clipping of negative
# values in a single pass over the output matrix.
distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, distances)
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
return distances if squared else np.sqrt(distances)
def test_euclidean_distances():
""" Check the pairwise Euclidean distances computation"""
X = [[0]]
Y = [[1], [2]]
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])
X = csr_matrix(X)
Y = csr_matrix(Y)
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])
def main():
test_euclidean_distances()
from time import time
# random sparse
print "sparse 1000x1000:"
print '(X, Y): '
X = np.zeros((1000, 1000))
X.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
Y = np.zeros((1000, 1000))
Y.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5)
X = csr_matrix(X)
Y = csr_matrix(Y)
import gc
gc.collect()
t0 = time()
A = euclidean_distances_slow(X, Y)
print 'slow: ', time() - t0
t0 = time()
#XYt = safe_sparse_dot(X, Y.T, dense_output=True)
#B = sparse_euclidean_distances(X.shape[0], Y.shape[0],
# X.data, X.indices, X.indptr,
# Y.data, Y.indices, Y.indptr,
# XYt)
B = euclidean_distances(X, Y)
print 'fast: ', time() - t0
print np.sum((A - B) ** 2)
del Y, A, B
print '(X, X): '
t0 = time()
A = euclidean_distances_slow(X)
print 'slow: ', time() - t0
t0 = time()
B = euclidean_distances(X)
print 'fast: ', time() - t0
print 'error: ', np.sum((A - B) ** 2)
del A, B, X
for k in (100, 1000, 10000):
print "dense 1000x%d:" % k
print '(X, Y): '
gc.collect()
X, Y = np.random.randn(1000, k), np.random.randn(1000, k)
t0 = time()
A = euclidean_distances_slow(X, Y, squared=False)
print 'slow: ', time() - t0
t0 = time()
B = euclidean_distances(X, Y, squared=False)
print 'fast: ', time() - t0
# bare metal call
#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
#dense_euclidean_distances(X, Y, C, squared=False)
#print time() - t0
print 'error: ', np.sum((A - B) ** 2)
del A, B, Y
print "(X, X):"
gc.collect()
t0 = time()
A = euclidean_distances_slow(X, squared=False)
print 'slow: ', time() - t0
t0 = time()
B = euclidean_distances(X, squared=False)
print 'fast: ', time() - t0
#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double)
#dense_euclidean_distances(X, Y, C, squared=False)
#print time() - t0
print 'error: ', np.sum((A - B) ** 2)
if __name__ == '__main__':
main()
sparse 1000x1000:
(X, Y):
slow: 0.0871849060059
fast: 0.0804500579834
6.69689265991e-25
(X, X):
slow: 0.0874190330505
fast: 0.0775861740112
error: 6.74330923557e-25
dense 1000x100:
(X, Y):
slow: 0.0320308208466
fast: 0.0189578533173
error: 6.51681149247e-25
(X, X):
slow: 0.0325620174408
fast: 0.0181469917297
error: 3.12638803735e-13
dense 1000x1000:
(X, Y):
slow: 0.140306949615
fast: 0.105072021484
error: 7.70655257706e-24
(X, X):
slow: 0.11566400528
fast: 0.0698621273041
error: 6.57564669382e-10
dense 1000x10000:
(X, Y):
slow: 1.12574601173
fast: 0.911577939987
error: 1.36188340832e-22
(X, X):
slow: 1.04345297813
fast: 0.521425008774
error: 2.18351488002e-08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment