Created
August 9, 2012 16:44
-
-
Save vene/3305769 to your computer and use it in GitHub Desktop.
Benchmark euclidean_distances
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.sparse import csr_matrix | |
from scipy.sparse import issparse | |
from sklearn.utils import atleast2d_or_csr | |
from sklearn.utils.extmath import safe_sparse_dot | |
from sklearn.metrics.pairwise import check_pairwise_arrays, euclidean_distances | |
from sklearn.metrics.euclidean_fast import dense_euclidean_distances, sparse_euclidean_distances | |
from numpy.testing import assert_array_almost_equal | |
def euclidean_distances_slow(X, Y=None, Y_norm_squared=None, squared=False): | |
""" | |
Considering the rows of X (and Y=X) as vectors, compute the | |
distance matrix between each pair of vectors. | |
For efficiency reasons, the euclidean distance between a pair of row | |
vector x and y is computed as:: | |
dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) | |
This formulation has two main advantages. First, it is computationally | |
efficient when dealing with sparse data. Second, if x varies but y | |
remains unchanged, then the right-most dot-product `dot(y, y)` can be | |
pre-computed. | |
Parameters | |
---------- | |
X : {array-like, sparse matrix}, shape = [n_samples_1, n_features] | |
Y : {array-like, sparse matrix}, shape = [n_samples_2, n_features] | |
Y_norm_squared : array-like, shape = [n_samples_2], optional | |
Pre-computed dot-products of vectors in Y (e.g., | |
``(Y**2).sum(axis=1)``) | |
squared : boolean, optional | |
Return squared Euclidean distances. | |
Returns | |
------- | |
distances : {array, sparse matrix}, shape = [n_samples_1, n_samples_2] | |
Examples | |
-------- | |
>>> from sklearn.metrics.pairwise import euclidean_distances | |
>>> X = [[0, 1], [1, 1]] | |
>>> # distance between rows of X | |
>>> euclidean_distances(X, X) | |
array([[ 0., 1.], | |
[ 1., 0.]]) | |
>>> # get distance to origin | |
>>> euclidean_distances(X, [[0, 0]]) | |
array([[ 1. ], | |
[ 1.41421356]]) | |
""" | |
# should not need X_norm_squared because if you could precompute that as | |
# well as Y, then you should just pre-compute the output and not even | |
# call this function. | |
X, Y = check_pairwise_arrays(X, Y) | |
if issparse(X): | |
XX = X.multiply(X).sum(axis=1) | |
else: | |
XX = np.sum(X * X, axis=1)[:, np.newaxis] | |
if X is Y: # shortcut in the common case euclidean_distances(X, X) | |
YY = XX.T | |
elif Y_norm_squared is None: | |
if issparse(Y): | |
# scipy.sparse matrices don't have element-wise scalar | |
# exponentiation, and tocsr has a copy kwarg only on CSR matrices. | |
YY = Y.copy() if isinstance(Y, csr_matrix) else Y.tocsr() | |
YY.data **= 2 | |
YY = np.asarray(YY.sum(axis=1)).T | |
else: | |
YY = np.sum(Y ** 2, axis=1)[np.newaxis, :] | |
else: | |
YY = atleast2d_or_csr(Y_norm_squared) | |
if YY.shape != (1, Y.shape[0]): | |
raise ValueError( | |
"Incompatible dimensions for Y and Y_norm_squared") | |
# TODO: a faster Cython implementation would do the clipping of negative | |
# values in a single pass over the output matrix. | |
distances = safe_sparse_dot(X, Y.T, dense_output=True) | |
distances *= -2 | |
distances += XX | |
distances += YY | |
np.maximum(distances, 0, distances) | |
if X is Y: | |
# Ensure that distances between vectors and themselves are set to 0.0. | |
# This may not be the case due to floating point rounding errors. | |
distances.flat[::distances.shape[0] + 1] = 0.0 | |
return distances if squared else np.sqrt(distances) | |
def test_euclidean_distances(): | |
""" Check the pairwise Euclidean distances computation""" | |
X = [[0]] | |
Y = [[1], [2]] | |
D = euclidean_distances(X, Y) | |
assert_array_almost_equal(D, [[1., 2.]]) | |
X = csr_matrix(X) | |
Y = csr_matrix(Y) | |
D = euclidean_distances(X, Y) | |
assert_array_almost_equal(D, [[1., 2.]]) | |
def main(): | |
test_euclidean_distances() | |
from time import time | |
# random sparse | |
print "sparse 1000x1000:" | |
print '(X, Y): ' | |
X = np.zeros((1000, 1000)) | |
X.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5) | |
Y = np.zeros((1000, 1000)) | |
Y.ravel()[np.random.random_integers(0, 1e6 - 1, 1e5)] = np.random.randn(1e5) | |
X = csr_matrix(X) | |
Y = csr_matrix(Y) | |
import gc | |
gc.collect() | |
t0 = time() | |
A = euclidean_distances_slow(X, Y) | |
print 'slow: ', time() - t0 | |
t0 = time() | |
#XYt = safe_sparse_dot(X, Y.T, dense_output=True) | |
#B = sparse_euclidean_distances(X.shape[0], Y.shape[0], | |
# X.data, X.indices, X.indptr, | |
# Y.data, Y.indices, Y.indptr, | |
# XYt) | |
B = euclidean_distances(X, Y) | |
print 'fast: ', time() - t0 | |
print np.sum((A - B) ** 2) | |
del Y, A, B | |
print '(X, X): ' | |
t0 = time() | |
A = euclidean_distances_slow(X) | |
print 'slow: ', time() - t0 | |
t0 = time() | |
B = euclidean_distances(X) | |
print 'fast: ', time() - t0 | |
print 'error: ', np.sum((A - B) ** 2) | |
del A, B, X | |
for k in (100, 1000, 10000): | |
print "dense 1000x%d:" % k | |
print '(X, Y): ' | |
gc.collect() | |
X, Y = np.random.randn(1000, k), np.random.randn(1000, k) | |
t0 = time() | |
A = euclidean_distances_slow(X, Y, squared=False) | |
print 'slow: ', time() - t0 | |
t0 = time() | |
B = euclidean_distances(X, Y, squared=False) | |
print 'fast: ', time() - t0 | |
# bare metal call | |
#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double) | |
#dense_euclidean_distances(X, Y, C, squared=False) | |
#print time() - t0 | |
print 'error: ', np.sum((A - B) ** 2) | |
del A, B, Y | |
print "(X, X):" | |
gc.collect() | |
t0 = time() | |
A = euclidean_distances_slow(X, squared=False) | |
print 'slow: ', time() - t0 | |
t0 = time() | |
B = euclidean_distances(X, squared=False) | |
print 'fast: ', time() - t0 | |
#C = np.empty((X.shape[0], Y.shape[0]), dtype=np.double) | |
#dense_euclidean_distances(X, Y, C, squared=False) | |
#print time() - t0 | |
print 'error: ', np.sum((A - B) ** 2) | |
if __name__ == '__main__': | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sparse 1000x1000: | |
(X, Y): | |
slow: 0.0871849060059 | |
fast: 0.0804500579834 | |
6.69689265991e-25 | |
(X, X): | |
slow: 0.0874190330505 | |
fast: 0.0775861740112 | |
error: 6.74330923557e-25 | |
dense 1000x100: | |
(X, Y): | |
slow: 0.0320308208466 | |
fast: 0.0189578533173 | |
error: 6.51681149247e-25 | |
(X, X): | |
slow: 0.0325620174408 | |
fast: 0.0181469917297 | |
error: 3.12638803735e-13 | |
dense 1000x1000: | |
(X, Y): | |
slow: 0.140306949615 | |
fast: 0.105072021484 | |
error: 7.70655257706e-24 | |
(X, X): | |
slow: 0.11566400528 | |
fast: 0.0698621273041 | |
error: 6.57564669382e-10 | |
dense 1000x10000: | |
(X, Y): | |
slow: 1.12574601173 | |
fast: 0.911577939987 | |
error: 1.36188340832e-22 | |
(X, X): | |
slow: 1.04345297813 | |
fast: 0.521425008774 | |
error: 2.18351488002e-08 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment