Skip to content

Instantly share code, notes, and snippets.

@StuartGordonReid
Created June 15, 2015 14:36
Show Gist options
  • Star 18 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save StuartGordonReid/90c8301f9403707a22ff to your computer and use it in GitHub Desktop.
Save StuartGordonReid/90c8301f9403707a22ff to your computer and use it in GitHub Desktop.
Implementation of various distance metrics in Python
import math
import random
import csv
import cProfile
import numpy as np
import hashlib
memoization = {}
class Similarity:
"""
This class contains instances of similarity / distance metrics. These are used in centroid based clustering
algorithms to identify similar patterns and put them into the same homogeneous sub sets
:param minimum: the minimum distance between two patterns (so you don't divide by 0)
"""
def __init__(self, minimum):
self.e = minimum
self.vector_operators = VectorOperations()
def manhattan_distance(self, p_vec, q_vec):
"""
This method implements the manhattan distance metric
:param p_vec: vector one
:param q_vec: vector two
:return: the manhattan distance between vector one and two
"""
return max(np.sum(np.fabs(p_vec - q_vec)), self.e)
def square_euclidean_distance(self, p_vec, q_vec):
"""
This method implements the squared euclidean distance metric
:param p_vec: vector one
:param q_vec: vector two
:return: the squared euclidean distance between vector one and two
"""
diff = p_vec - q_vec
return max(np.sum(diff**2), self.e)
def euclidean_distance(self, p_vec, q_vec):
"""
This method implements the euclidean distance metric
:param p_vec: vector one
:param q_vec: vector two
:return: the euclidean distance between vector one and two
"""
return max(math.sqrt(self.square_euclidean_distance(p_vec, q_vec)), self.e)
def half_square_euclidean_distance(self, p_vec, q_vec):
"""
This method implements the half squared euclidean distance metric
:param p_vec: vector one
:param q_vec: vector two
:return: the half squared euclidean distance between vector one and two
"""
return max(0.5 * self.square_euclidean_distance(p_vec, q_vec), self.e)
def cosine_similarity(self, p_vec, q_vec):
"""
This method implements the cosine similarity metric
:param p_vec: vector one
:param q_vec: vector two
:return: the cosine similarity between vector one and two
"""
pq = self.vector_operators.product(p_vec, q_vec)
p_norm = self.vector_operators.norm(p_vec)
q_norm = self.vector_operators.norm(q_vec)
return max(pq / (p_norm * q_norm), self.e)
def tanimoto_coefficient(self, p_vec, q_vec):
"""
This method implements the cosine tanimoto coefficient metric
:param p_vec: vector one
:param q_vec: vector two
:return: the tanimoto coefficient between vector one and two
"""
pq = self.vector_operators.product(p_vec, q_vec)
p_square = self.vector_operators.square(p_vec)
q_square = self.vector_operators.square(q_vec)
return max(pq / (p_square + q_square - pq), self.e)
def fractional_distance(self, p_vec, q_vec, fraction=2.0):
"""
This method implements the fractional distance metric. I have implemented memoization for this method to reduce
the number of function calls required. The net effect is that the algorithm runs 400% faster. A similar approach
can be used with any of the above distance metrics as well.
:param p_vec: vector one
:param q_vec: vector two
:param fraction: the fractional distance value (power)
:return: the fractional distance between vector one and two
"""
# memoization is used to reduce unnecessary calculations ... makes a BIG difference
memoize = True
if memoize:
key = self.get_key(p_vec, q_vec)
x = memoization.get(key)
if x is None:
diff = p_vec - q_vec
diff_fraction = diff**fraction
return max(math.pow(np.sum(diff_fraction), 1/fraction), self.e)
else:
return x
else:
diff = p_vec - q_vec
diff_fraction = diff**fraction
return max(math.pow(np.sum(diff_fraction), 1/fraction), self.e)
@staticmethod
def get_key(p_vec, q_vec):
"""
This method returns a unique hash value for two vectors. The hash value is equal to the concatenated string of
the hash value for vector one and vector two. E.g. is hash(p_vec) = 1234 and hash(q_vec) = 5678 then get_key(
p_vec, q_vec) = 12345678. Memoization improved the speed of this algorithm 400%.
:param p_vec: vector one
:param q_vec: vector two
:return: a unique hash
"""
# return str(hash(tuple(p_vec))) + str(hash(tuple(q_vec)))
return str(hashlib.sha1(p_vec)) + str(hashlib.sha1(q_vec))
class VectorOperations():
"""
This class contains useful implementations of methods which can be performed on vectors
"""
@staticmethod
def product(p_vec, q_vec):
"""
This method returns the product of two lists / vectors
:param p_vec: vector one
:param q_vec: vector two
:return: the product of p_vec and q_vec
"""
return p_vec * q_vec
@staticmethod
def square(p_vec):
"""
This method returns the square of a vector
:param p_vec: the vector to be squared
:return: the squared value of the vector
"""
return p_vec**2
@staticmethod
def norm(p_vec):
"""
This method returns the norm value of a vector
:param p_vec: the vector to be normed
:return: the norm value of the vector
"""
return np.sqrt(p_vec)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment