Created
June 27, 2012 03:14
-
-
Save andrewdyates/3001099 to your computer and use it in GitHub Desktop.
mstats distances from http://projects.scipy.org/scipy/attachment/ticket/1474/mdistance.2.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
===================================================== | |
Distance computations for masked arrays (:mod:`scipy.spatial.mdistance`) | |
===================================================== | |
.. sectionauthor:: Damian Eads, Jason Merkin | |
Function Reference | |
------------------ | |
Distance matrix computation from a collection of raw observation vectors | |
stored in a rectangular array. | |
.. autosummary:: | |
:toctree: generated/ | |
mpdist -- pairwise distances between observation vectors. | |
squareform -- convert distance matrix to a condensed one and vice versa | |
Predicates for checking the validity of distance matrices, both | |
condensed and redundant. Also contained in this module are functions | |
for computing the number of observations in a distance matrix. | |
.. autosummary:: | |
:toctree: generated/ | |
is_valid_dm -- checks for a valid distance matrix | |
is_valid_y -- checks for a valid condensed distance matrix | |
num_obs_dm -- # of observations in a distance matrix | |
num_obs_y -- # of observations in a condensed distance matrix | |
Distance functions between two vectors ``u`` and ``v``. Computing | |
distances over a large collection of vectors is inefficient for these | |
functions. Use ``mpdist`` for this purpose. | |
.. autosummary:: | |
:toctree: generated/ | |
braycurtis -- the Bray-Curtis distance. | |
canberra -- the Canberra distance. | |
chebyshev -- the Chebyshev distance. | |
cityblock -- the Manhattan distance. | |
correlation -- the Correlation distance. | |
cosine -- the Cosine distance. | |
dice -- the Dice dissimilarity (boolean). | |
euclidean -- the Euclidean distance. | |
hamming -- the Hamming distance (boolean). | |
jaccard -- the Jaccard distance (boolean). | |
kulsinski -- the Kulsinski distance (boolean). | |
mahalanobis -- the Mahalanobis distance. | |
matching -- the matching dissimilarity (boolean). | |
minkowski -- the Minkowski distance. | |
rogerstanimoto -- the Rogers-Tanimoto dissimilarity (boolean). | |
russellrao -- the Russell-Rao dissimilarity (boolean). | |
seuclidean -- the normalized Euclidean distance. | |
sokalmichener -- the Sokal-Michener dissimilarity (boolean). | |
sokalsneath -- the Sokal-Sneath dissimilarity (boolean). | |
sqeuclidean -- the squared Euclidean distance. | |
yule -- the Yule dissimilarity (boolean). | |
References | |
---------- | |
.. [Sta07] "Statistics toolbox." API Reference Documentation. The MathWorks. | |
http://www.mathworks.com/access/helpdesk/help/toolbox/stats/. | |
Accessed October 1, 2007. | |
.. [Mti07] "Hierarchical clustering." API Reference Documentation. | |
The Wolfram Research, Inc. | |
http://reference.wolfram.com/mathematica/HierarchicalClustering/tutorial/HierarchicalClustering.html. | |
Accessed October 1, 2007. | |
.. [Gow69] Gower, JC and Ross, GJS. "Minimum Spanning Trees and Single Linkage | |
Cluster Analysis." Applied Statistics. 18(1): pp. 54--64. 1969. | |
.. [War63] Ward Jr, JH. "Hierarchical grouping to optimize an objective | |
function." Journal of the American Statistical Association. 58(301): | |
pp. 236--44. 1963. | |
.. [Joh66] Johnson, SC. "Hierarchical clustering schemes." Psychometrika. | |
32(2): pp. 241--54. 1966. | |
.. [Sne62] Sneath, PH and Sokal, RR. "Numerical taxonomy." Nature. 193: pp. | |
855--60. 1962. | |
.. [Bat95] Batagelj, V. "Comparing resemblance measures." Journal of | |
Classification. 12: pp. 73--90. 1995. | |
.. [Sok58] Sokal, RR and Michener, CD. "A statistical method for evaluating | |
systematic relationships." Scientific Bulletins. 38(22): | |
pp. 1409--38. 1958. | |
.. [Ede79] Edelbrock, C. "Mixture model tests of hierarchical clustering | |
algorithms: the problem of classifying everybody." Multivariate | |
Behavioral Research. 14: pp. 367--84. 1979. | |
.. [Jai88] Jain, A., and Dubes, R., "Algorithms for Clustering Data." | |
Prentice-Hall. Englewood Cliffs, NJ. 1988. | |
.. [Fis36] Fisher, RA "The use of multiple measurements in taxonomic | |
problems." Annals of Eugenics, 7(2): 179-188. 1936 | |
Copyright Notice | |
---------------- | |
Copyright (C) Damian Eads, 2007-2008, Jason Merkin 2011. New BSD License. | |
""" | |
import warnings | |
import numpy as np | |
import numpy.ma as ma | |
import _distance_wrap | |
def _nbool_correspond_all(u, v): | |
if u.dtype != v.dtype: | |
raise TypeError("Arrays being compared must be of the same data type.") | |
if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double: | |
not_u = 1.0 - u | |
not_v = 1.0 - v | |
nff = (not_u * not_v).sum() | |
nft = (not_u * v).sum() | |
ntf = (u * not_v).sum() | |
ntt = (u * v).sum() | |
elif u.dtype == np.bool: | |
not_u = ~u | |
not_v = ~v | |
nff = (not_u & not_v).sum() | |
nft = (not_u & v).sum() | |
ntf = (u & not_v).sum() | |
ntt = (u & v).sum() | |
else: | |
raise TypeError("Arrays being compared have unknown type.") | |
return (nff, nft, ntf, ntt) | |
def _nbool_correspond_ft_tf(u, v): | |
if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double: | |
not_u = 1.0 - u | |
not_v = 1.0 - v | |
nft = (not_u * v).sum() | |
ntf = (u * not_v).sum() | |
else: | |
not_u = ~u | |
not_v = ~v | |
nft = (not_u & v).sum() | |
ntf = (u & not_v).sum() | |
return (nft, ntf) | |
def _mcopy_array_if_base_present(a): | |
""" | |
Copies the array if its base points to a parent array. | |
""" | |
if a.base is not None: | |
return a.copy() | |
elif np.issubsctype(a, np.float32): | |
return ma.array(a, dtype=np.double) | |
else: | |
return a | |
def _mcopy_arrays_if_base_present(T): | |
""" | |
Accepts a tuple of arrays T. Copies the array T[i] if its base array | |
points to an actual array. Otherwise, the reference is just copied. | |
This is useful if the arrays are being passed to a C function that | |
does not do proper striding. | |
""" | |
l = [_mcopy_array_if_base_present(a) for a in T] | |
return l | |
def _mseuclidean(X, V, m, dm): | |
""" | |
Does masked seuclidean for a full array | |
""" | |
if V is not None: | |
V = ma.asarray(V, order='c') | |
if type(V) != ma.array: | |
raise TypeError('Variance vector V must be a numpy array') | |
if V.dtype != np.double: | |
raise TypeError('Variance vector V must contain doubles.') | |
if len(V.shape) != 1: | |
raise ValueError('Variance vector V must be one-dimensional.') | |
if V.shape[0] != n: | |
raise ValueError('Variance vector V must be of the same ' | |
'dimension as the vectors on which the distances ' | |
'are computed.') | |
# The C code doesn't do striding. | |
[VV] = _mcopy_arrays_if_base_present([_convert_to_double(V)]) | |
else: | |
VV = ma.var(X, axis=0, ddof=1) | |
k = 0 | |
for i in xrange(0, m-1): | |
for j in xrange(i+1,m): | |
d = mseuclidean(X[i],X[j],VV) | |
dm[k] = d | |
k += 1 | |
return dm | |
def mpdist(X, metric='euclidean', p=2, w=None, V=None, VI=None): | |
r""" | |
Adapted pdist to masked arrays | |
Computes the pairwise distances between m original observations in | |
n-dimensional space. Returns a condensed distance matrix Y. For | |
each :math:`i` and :math:`j` (where :math:`i<j<n`), the | |
metric ``dist(u=X[i], v=X[j])`` is computed and stored in the | |
:math:`ij`th entry. | |
See ``squareform`` for information on how to calculate the index of | |
this entry or to convert the condensed distance matrix to a | |
redundant square matrix. | |
The following are common calling conventions. | |
1. ``Y = mpdist(X, 'euclidean')`` | |
Computes the distance between m points using Euclidean distance | |
(2-norm) as the distance metric between the points. The points | |
are arranged as m n-dimensional row vectors in the matrix X. | |
2. ``Y = mpdist(X, 'minkowski', p)`` | |
Computes the distances using the Minkowski distance | |
:math:`||u-v||_p` (p-norm) where :math:`p \geq 1`. | |
3. ``Y = mpdist(X, 'cityblock')`` | |
Computes the city block or Manhattan distance between the | |
points. | |
4. ``Y = mpdist(X, 'seuclidean', V=None)`` | |
Computes the standardized Euclidean distance. The standardized | |
Euclidean distance between two n-vectors ``u`` and ``v`` is | |
.. math:: | |
\sqrt{\sum {(u_i-v_i)^2 / V[x_i]}}. | |
V is the variance vector; V[i] is the variance computed over all | |
the i'th components of the points. If not passed, it is | |
automatically computed. | |
5. ``Y = mpdist(X, 'sqeuclidean')`` | |
Computes the squared Euclidean distance :math:`||u-v||_2^2` between | |
the vectors. | |
6. ``Y = mpdist(X, 'cosine')`` | |
Computes the cosine distance between vectors u and v, | |
.. math:: | |
\frac{1 - uv^T} | |
{{|u|}_2 {|v|}_2} | |
where |*|_2 is the 2 norm of its argument *. | |
7. ``Y = mpdist(X, 'correlation')`` | |
Computes the correlation distance between vectors u and v. This is | |
.. math:: | |
\frac{1 - (u - \bar{u})(v - \bar{v})^T} | |
{{|(u - \bar{u})|}{|(v - \bar{v})|}^T} | |
where :math:`\bar{v}` is the mean of the elements of vector v. | |
8. ``Y = mpdist(X, 'hamming')`` | |
Computes the normalized Hamming distance, or the proportion of | |
those vector elements between two n-vectors ``u`` and ``v`` | |
which disagree. To save memory, the matrix ``X`` can be of type | |
boolean. | |
9. ``Y = mpdist(X, 'jaccard')`` | |
Computes the Jaccard distance between the points. Given two | |
vectors, ``u`` and ``v``, the Jaccard distance is the | |
proportion of those elements ``u[i]`` and ``v[i]`` that | |
disagree where at least one of them is non-zero. | |
10. ``Y = mpdist(X, 'chebyshev')`` | |
Computes the Chebyshev distance between the points. The | |
Chebyshev distance between two n-vectors ``u`` and ``v`` is the | |
maximum norm-1 distance between their respective elements. More | |
precisely, the distance is given by | |
.. math:: | |
d(u,v) = \max_i {|u_i-v_i|}. | |
11. ``Y = mpdist(X, 'canberra')`` | |
Computes the Canberra distance between the points. The | |
Canberra distance between two points ``u`` and ``v`` is | |
.. math:: | |
d(u,v) = \sum_u \frac{|u_i-v_i|} | |
{(|u_i|+|v_i|)} | |
12. ``Y = mpdist(X, 'braycurtis')`` | |
Computes the Bray-Curtis distance between the points. The | |
Bray-Curtis distance between two points ``u`` and ``v`` is | |
.. math:: | |
d(u,v) = \frac{\sum_i {u_i-v_i}} | |
{\sum_i {u_i+v_i}} | |
13. ``Y = mpdist(X, 'mahalanobis', VI=None)`` | |
Computes the Mahalanobis distance between the points. The | |
Mahalanobis distance between two points ``u`` and ``v`` is | |
:math:`(u-v)(1/V)(u-v)^T` where :math:`(1/V)` (the ``VI`` | |
variable) is the inverse covariance. If ``VI`` is not None, | |
``VI`` will be used as the inverse covariance matrix. | |
14. ``Y = mpdist(X, 'yule')`` | |
Computes the Yule distance between each pair of boolean | |
vectors. (see yule function documentation) | |
15. ``Y = mpdist(X, 'matching')`` | |
Computes the matching distance between each pair of boolean | |
vectors. (see matching function documentation) | |
16. ``Y = mpdist(X, 'dice')`` | |
Computes the Dice distance between each pair of boolean | |
vectors. (see dice function documentation) | |
17. ``Y = mpdist(X, 'kulsinski')`` | |
Computes the Kulsinski distance between each pair of | |
boolean vectors. (see kulsinski function documentation) | |
18. ``Y = mpdist(X, 'rogerstanimoto')`` | |
Computes the Rogers-Tanimoto distance between each pair of | |
boolean vectors. (see rogerstanimoto function documentation) | |
19. ``Y = mpdist(X, 'russellrao')`` | |
Computes the Russell-Rao distance between each pair of | |
boolean vectors. (see russellrao function documentation) | |
20. ``Y = mpdist(X, 'sokalmichener')`` | |
Computes the Sokal-Michener distance between each pair of | |
boolean vectors. (see sokalmichener function documentation) | |
21. ``Y = mpdist(X, 'sokalsneath')`` | |
Computes the Sokal-Sneath distance between each pair of | |
boolean vectors. (see sokalsneath function documentation) | |
22. ``Y = mpdist(X, 'wminkowski')`` | |
Computes the weighted Minkowski distance between each pair of | |
vectors. (see wminkowski function documentation) | |
22. ``Y = mpdist(X, f)`` | |
Computes the distance between all pairs of vectors in X | |
using the user supplied 2-arity function f. For example, | |
Euclidean distance between the vectors could be computed | |
as follows:: | |
dm = mpdist(X, (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum()))) | |
Note that you should avoid passing a reference to one of | |
the distance functions defined in this library. For example,:: | |
dm = mpdist(X, sokalsneath) | |
would calculate the pair-wise distances between the vectors in | |
X using the Python function sokalsneath. This would result in | |
sokalsneath being called :math:`{n \choose 2}` times, which | |
is inefficient. Instead, the optimized C version is more | |
efficient, and we call it using the following syntax.:: | |
dm = mpdist(X, 'sokalsneath') | |
Parameters | |
---------- | |
X : ma.array | |
An m by n masked array of m original observations in an | |
n-dimensional space. | |
metric : string or function | |
The distance metric to use. The distance function can | |
be 'braycurtis', 'canberra', 'chebyshev', 'cityblock', | |
'correlation', 'cosine', 'dice', 'euclidean', 'hamming', | |
'jaccard', 'kulsinski', 'mahalanobis', 'matching', | |
'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', | |
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'. | |
w : ma.array | |
The weight vector (for weighted Minkowski). | |
p : double | |
The p-norm to apply (for Minkowski, weighted and unweighted) | |
V : ma.array | |
The variance vector (for standardized Euclidean). | |
VI : ma.array | |
The inverse of the covariance matrix (for Mahalanobis). | |
Returns | |
------- | |
Y : ma.array | |
A condensed masked distance matrix. | |
See Also | |
-------- | |
squareform : converts between condensed distance matrices and | |
square distance matrices. | |
""" | |
X = ma.asarray(X, order='c') | |
# The C code doesn't do striding. | |
[X] = _mcopy_arrays_if_base_present([_convert_to_double(X)]) | |
s = X.shape | |
if len(s) != 2: | |
raise ValueError('A 2-dimensional array must be passed.'); | |
m, n = s | |
dm = ma.zeros((m * (m - 1) / 2,), dtype=np.double) | |
wmink_names = ['wminkowski', 'wmi', 'wm', 'wpnorm'] | |
if w is None and (metric == wminkowski or metric in wmink_names): | |
raise ValueError('weighted minkowski requires a weight ' | |
'vector `w` to be given.') | |
if callable(metric): | |
if metric == mminkowski: | |
def dfun(u,v): return mminkowski(u, v, p) | |
elif metric == mwminkowski: | |
def dfun(u,v): return mwminkowski(u, v, p, w) | |
elif metric == mseuclidean: | |
def dfun(u,v): return mseuclidean(u, v, V) | |
elif metric == mahalanobis: | |
def dfun(u,v): return mmahalanobis(u, v, V) | |
else: | |
dfun = metric | |
k = 0 | |
for i in xrange(0, m - 1): | |
for j in xrange(i+1, m): | |
dm[k] = dfun(X[i], X[j]) | |
k = k + 1 | |
elif isinstance(metric,basestring): | |
mstr = metric.lower() | |
if metric == 'euclidean': | |
dm = mpdist(X, meuclidean) | |
elif mstr in set(['seuclidean', 'se', 's']): | |
dm = _mseuclidean(X, V, m, dm) | |
elif metric == 'sqeuclidean': | |
dm = mpdist(X, meuclidean) | |
dm = dm ** 2.0 | |
elif metric == 'braycurtis': | |
dm = mpdist(X, mbraycurtis) | |
elif metric == 'mahalanobis': | |
if VI is None: | |
V = ma.cov(X.T) | |
VI = np.linalg.inv(V) | |
else: | |
VI = ma.asarray(VI, order='c') | |
[VI] = _mcopy_arrays_if_base_present([VI]) | |
# (u-v)V^(-1)(u-v)^T | |
dm = mpdist(X, (lambda u, v: mmahalanobis(u, v, VI))) | |
elif metric == 'canberra': | |
dm = mpdist(X, mcanberra) | |
elif metric == 'cityblock': | |
dm = mpdist(X, mcityblock) | |
elif metric == 'minkowski': | |
dm = mpdist(X, mminkowski, p=p) | |
elif metric == 'wminkowski': | |
dm = mpdist(X, mwminkowski, p=p, w=w) | |
elif metric == 'cosine': | |
dm = mpdist(X, mcosine) | |
elif metric == 'correlation': | |
dm = mpdist(X, mcorrelation) | |
elif metric == 'hamming': | |
dm = mpdist(X, mhamming) | |
elif metric == 'jaccard': | |
dm = mpdist(X, mjaccard) | |
elif metric == 'chebyshev' or metric == 'chebychev': | |
dm = mpdist(X, mchebyshev) | |
elif metric == 'yule': | |
dm = mpdist(X, myule) | |
elif metric == 'matching': | |
dm = mpdist(X, mmatching) | |
elif metric == 'dice': | |
dm = mpdist(X, mdice) | |
elif metric == 'kulsinski': | |
dm = mpdist(X, mkulsinski) | |
elif metric == 'rogerstanimoto': | |
dm = mpdist(X, mrogerstanimoto) | |
elif metric == 'russellrao': | |
dm = mpdist(X, mrussellrao) | |
elif metric == 'sokalsneath': | |
dm = mpdist(X, msokalsneath) | |
elif metric == 'sokalmichener': | |
dm = mpdist(X, msokalmichener) | |
else: | |
raise ValueError('Unknown Distance Metric: %s' % mstr) | |
return dm | |
def mminkowski(u, v, p): | |
r""" | |
Computes the Minkowski distance between two vectors ``u`` and ``v``, | |
defined as | |
.. math:: | |
{||u-v||}_p = (\sum{|u_i - v_i|^p})^{1/p}. | |
Parameters | |
---------- | |
u : ndarray | |
An n-dimensional vector. | |
v : ndarray | |
An n-dimensional vector. | |
p : ndarray | |
The norm of the difference :math:`{||u-v||}_p`. | |
Returns | |
------- | |
d : double | |
The Minkowski distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
if p < 1: | |
raise ValueError("p must be at least 1") | |
return (abs(u-v)**p).sum() ** (1.0 / p) | |
def mwminkowski(u, v, p, w): | |
r""" | |
Computes the weighted Minkowski distance between two vectors ``u`` | |
and ``v``, defined as | |
.. math:: | |
\left(\sum{(w_i |u_i - v_i|^p)}\right)^{1/p}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
p : ndarray | |
The norm of the difference :math:`{||u-v||}_p`. | |
w : ndarray | |
The weight vector. | |
Returns | |
------- | |
d : double | |
The Minkowski distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
w = ma.asarray(w) | |
if p < 1: | |
raise ValueError("p must be at least 1") | |
return ((w * abs(u-v))**p).sum() ** (1.0 / p) | |
def meuclidean(u, v): | |
""" | |
Computes the Euclidean distance between two n-vectors ``u`` and ``v``, | |
which is defined as | |
.. math:: | |
{||u-v||}_2 | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Euclidean distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
q=np.matrix(u-v) | |
return np.sqrt((q*q.T).sum()) | |
def msqeuclidean(u, v): | |
""" | |
Computes the squared Euclidean distance between two n-vectors u and v, | |
which is defined as | |
.. math:: | |
{||u-v||}_2^2. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The squared Euclidean distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return ((u-v)*(u-v).T).sum() | |
def mcosine(u, v): | |
r""" | |
Computes the Cosine distance between two n-vectors u and v, which | |
is defined as | |
.. math:: | |
\frac{1-uv^T} | |
{||u||_2 ||v||_2}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Cosine distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return (1.0 - (ma.dot(u, v.T) / \ | |
(ma.sqrt(ma.dot(u, u.T)) * ma.sqrt(ma.dot(v, v.T))))) | |
def mcorrelation(u, v): | |
r""" | |
Computes the correlation distance between two n-vectors ``u`` and | |
``v``, which is defined as | |
.. math:: | |
\frac{1 - (u - \bar{u}){(v - \bar{v})}^T} | |
{{||(u - \bar{u})||}_2 {||(v - \bar{v})||}_2^T} | |
where :math:`\bar{u}` is the mean of a vectors elements and ``n`` | |
is the common dimensionality of ``u`` and ``v``. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The correlation distance between vectors ``u`` and ``v``. | |
""" | |
umu = u.mean() | |
vmu = v.mean() | |
um = u - umu | |
vm = v - vmu | |
return 1.0 - (ma.dot(um, vm) / | |
(ma.sqrt(ma.dot(um, um)) \ | |
* ma.sqrt(ma.dot(vm, vm)))) | |
def mhamming(u, v): | |
r""" | |
Computes the Hamming distance between two n-vectors ``u`` and | |
``v``, which is simply the proportion of disagreeing components in | |
``u`` and ``v``. If ``u`` and ``v`` are boolean vectors, the Hamming | |
distance is | |
.. math:: | |
\frac{c_{01} + c_{10}}{n} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Hamming distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return (u != v).mean() | |
def mjaccard(u, v): | |
""" | |
Computes the Jaccard-Needham dissimilarity between two boolean | |
n-vectors u and v, which is | |
.. math:: | |
\frac{c_{TF} + c_{FT}} | |
{c_{TT} + c_{FT} + c_{TF}} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Jaccard distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return (np.double(np.bitwise_and((u != v), | |
np.bitwise_or(u != 0, v != 0)).sum()) | |
/ np.double(np.bitwise_or(u != 0, v != 0).sum())) | |
def mkulsinski(u, v): | |
""" | |
Computes the Kulsinski dissimilarity between two boolean n-vectors | |
u and v, which is defined as | |
.. math:: | |
\frac{c_{TF} + c_{FT} - c_{TT} + n} | |
{c_{FT} + c_{TF} + n} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Kulsinski distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
n = len(u) | |
(nff, nft, ntf, ntt) = _nbool_correspond_all(u, v) | |
return (ntf + nft - ntt + n) / (ntf + nft + n) | |
def mseuclidean(u, v, V): | |
""" | |
Returns the standardized Euclidean distance between two n-vectors | |
``u`` and ``v``. ``V`` is an m-dimensional vector of component | |
variances. It is usually computed among a larger collection | |
vectors. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The standardized Euclidean distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
V = ma.asarray(V, order='c') | |
if len(V.shape) != 1 or V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]: | |
raise TypeError('V must be a 1-D array of the same dimension as u and v.') | |
return ma.sqrt(((u-v)**2 / V).sum()) | |
def mcityblock(u, v): | |
r""" | |
Computes the Manhattan distance between two n-vectors u and v, | |
which is defined as | |
.. math:: | |
\sum_i {(u_i-v_i)}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The City Block distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return abs(u-v).sum() | |
def mmahalanobis(u, v, VI): | |
r""" | |
Computes the Mahalanobis distance between two n-vectors ``u`` and ``v``, | |
which is defiend as | |
.. math:: | |
(u-v)V^{-1}(u-v)^T | |
where ``VI`` is the inverse covariance matrix :math:`V^{-1}`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Mahalanobis distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
VI = ma.asarray(VI, order='c') | |
return ma.sqrt(ma.dot(ma.dot((u-v),VI),(u-v).T).sum()) | |
def mchebyshev(u, v): | |
r""" | |
Computes the Chebyshev distance between two n-vectors u and v, | |
which is defined as | |
.. math:: | |
\max_i {|u_i-v_i|}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Chebyshev distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return max(abs(u-v)) | |
def mbraycurtis(u, v): | |
r""" | |
Computes the Bray-Curtis distance between two n-vectors ``u`` and | |
``v``, which is defined as | |
.. math:: | |
\sum{|u_i-v_i|} / \sum{|u_i+v_i|}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Bray-Curtis distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return abs(u-v).sum() / abs(u+v).sum() | |
def mcanberra(u, v): | |
r""" | |
Computes the Canberra distance between two n-vectors u and v, | |
which is defined as | |
.. math:: | |
\frac{\sum_i {|u_i-v_i|}} | |
{\sum_i {|u_i|+|v_i|}}. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Canberra distance between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
return abs(u-v).sum() / (abs(u).sum() + abs(v).sum()) | |
def myule(u, v): | |
r""" | |
Computes the Yule dissimilarity between two boolean n-vectors u and v, | |
which is defined as | |
.. math:: | |
\frac{R}{c_{TT} + c_{FF} + \frac{R}{2}} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n` and :math:`R = 2.0 * (c_{TF} + c_{FT})`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Yule dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
(nff, nft, ntf, ntt) = _nbool_correspond_all(u, v) | |
return float(2.0 * ntf * nft) / float(ntt * nff + ntf * nft) | |
def mmatching(u, v): | |
r""" | |
Computes the Matching dissimilarity between two boolean n-vectors | |
u and v, which is defined as | |
.. math:: | |
\frac{c_{TF} + c_{FT}}{n} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Matching dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
(nft, ntf) = _nbool_correspond_ft_tf(u, v) | |
# len needed combined masks | |
return float(nft + ntf) / float(len(u + v)) | |
def mdice(u, v): | |
r""" | |
Computes the Dice dissimilarity between two boolean n-vectors | |
``u`` and ``v``, which is | |
.. math:: | |
\frac{c_{TF} + c_{FT}} | |
{2c_{TT} + c_{FT} + c_{TF}} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Dice dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
if u.dtype == np.bool: | |
ntt = (u & v).sum() | |
else: | |
ntt = (u * v).sum() | |
(nft, ntf) = _nbool_correspond_ft_tf(u, v) | |
return float(ntf + nft) / float(2.0 * ntt + ntf + nft) | |
def mrogerstanimoto(u, v): | |
r""" | |
Computes the Rogers-Tanimoto dissimilarity between two boolean | |
n-vectors ``u`` and ``v``, which is defined as | |
.. math:: | |
\frac{R} | |
{c_{TT} + c_{FF} + R} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Rogers-Tanimoto dissimilarity between vectors | |
`u` and `v`. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
(nff, nft, ntf, ntt) = _nbool_correspond_all(u, v) | |
return float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft))) | |
def mrussellrao(u, v): | |
r""" | |
Computes the Russell-Rao dissimilarity between two boolean n-vectors | |
``u`` and ``v``, which is defined as | |
.. math:: | |
\frac{n - c_{TT}} | |
{n} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Russell-Rao dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
if u.dtype == np.bool: | |
ntt = (u & v).sum() | |
else: | |
ntt = (u * v).sum() | |
return float(len(u) - ntt) / float(len(u)) | |
def msokalmichener(u, v): | |
r""" | |
Computes the Sokal-Michener dissimilarity between two boolean vectors | |
``u`` and ``v``, which is defined as | |
.. math:: | |
\frac{R} | |
{S + R} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n`, :math:`R = 2 * (c_{TF} + c_{FT})` and | |
:math:`S = c_{FF} + c_{TT}`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Sokal-Michener dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
if u.dtype == np.bool: | |
ntt = (u & v).sum() | |
nff = (~u & ~v).sum() | |
else: | |
ntt = (u * v).sum() | |
nff = ((1.0 - u) * (1.0 - v)).sum() | |
(nft, ntf) = _nbool_correspond_ft_tf(u, v) | |
return float(2.0 * (ntf + nft))/float(ntt + nff + 2.0 * (ntf + nft)) | |
def msokalsneath(u, v): | |
r""" | |
Computes the Sokal-Sneath dissimilarity between two boolean vectors | |
``u`` and ``v``, | |
.. math:: | |
\frac{R} | |
{c_{TT} + R} | |
where :math:`c_{ij}` is the number of occurrences of | |
:math:`\mathtt{u[k]} = i` and :math:`\mathtt{v[k]} = j` for | |
:math:`k < n` and :math:`R = 2(c_{TF} + c_{FT})`. | |
Parameters | |
---------- | |
u : ndarray | |
An :math:`n`-dimensional vector. | |
v : ndarray | |
An :math:`n`-dimensional vector. | |
Returns | |
------- | |
d : double | |
The Sokal-Sneath dissimilarity between vectors ``u`` and ``v``. | |
""" | |
u = ma.asarray(u, order='c') | |
v = ma.asarray(v, order='c') | |
if u.dtype == np.bool: | |
ntt = (u & v).sum() | |
else: | |
ntt = (u * v).sum() | |
(nft, ntf) = _nbool_correspond_ft_tf(u, v) | |
denom = ntt + 2.0 * (ntf + nft) | |
if denom == 0: | |
raise ValueError('Sokal-Sneath dissimilarity is not defined for ' | |
'vectors that are entirely false.') | |
return float(2.0 * (ntf + nft)) / denom | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment