Last active
July 30, 2023 15:29
-
-
Save estebanz01/62e4b1348086e24f5fe6c686a19c53f5 to your computer and use it in GitHub Desktop.
Some distance metrics implemented in plain Elixir, with tests. Blog in spanish: https://estebanz.co/posts/distance-metrics-in-elixir/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Distances do | |
def euclidean(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
# We zip both lists into one, then we reduce it to the euclidean calculation. | |
# Enum.zip_reduce(a, b, 0, fn) <- as a shortcut. | |
Stream.zip(a, b) | |
|> Enum.reduce(0, fn {x, y}, accumulator -> | |
accumulator + (x - y) ** 2 | |
end) | |
|> :math.sqrt() | |
end | |
def manhattan(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
Stream.zip(a, b) | |
|> Enum.reduce(0, fn {x,y}, accumulator -> accumulator + abs(x - y) end) | |
end | |
def chebyshev(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
Stream.zip(a, b) | |
|> Stream.map(fn {x,y} -> abs(x - y) end) | |
|> Enum.max() | |
end | |
def minkowski(a, b, p) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
case p do | |
1 -> manhattan(a, b) | |
2 -> euclidean(a, b) | |
_ -> | |
Stream.zip(a, b) | |
|> Enum.reduce(0, fn {x, y}, accumulator -> | |
accumulator + (abs(x - y) ** p) | |
end) |> (fn x -> x ** (1.0/p) end).() | |
end | |
end | |
def hamming(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
# Only count the ones that are different, which is the same as doing a summation on the signal output. | |
Stream.zip(a,b) |> Enum.count(fn {x, y} -> x != y end) | |
end | |
def dot_product(a, b) do | |
Stream.zip(a, b) | |
|> Enum.reduce(0, fn {x, y}, accumulator -> | |
accumulator + (x * y) | |
end) | |
end | |
def magnitude(a) do | |
Enum.reduce(a, 0, fn x, accumulator -> accumulator + (x ** 2) end) | |
|> :math.sqrt() | |
end | |
def cosine_similarity(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do | |
all_zeros = (fn x -> Enum.all?(x, fn y -> y == 0 end) end) | |
unless all_zeros.(a) or all_zeros.(b) do | |
dot_product(a, b) / (magnitude(a) * magnitude(b)) | |
end | |
end | |
def jaccard_similarity(a, b) when length(a) > 0 and length(b) > 0 do | |
set_a = MapSet.new(a) | |
set_b = MapSet.new(b) | |
intersection = MapSet.intersection(set_a, set_b) | |
union = MapSet.union(set_a, set_b) | |
MapSet.size(intersection) / MapSet.size(union) | |
end | |
# We don't need to pattern match the function signature here because it's already done | |
# on the cosine similarity calculation | |
def cosine(a, b), do: 1 - cosine_similarity(a, b) | |
def jaccard(a, b), do: 1 - jaccard_similarity(a, b) | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ExUnit.start() | |
defmodule AssertionTest do | |
use ExUnit.Case, async: true | |
describe "Distances.euclidean/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.euclidean([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.euclidean([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.euclidean([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.euclidean([], []) end | |
end | |
test "It calculates the euclidean distance for known lists" do | |
assert Distances.euclidean([0, 1, 2], [1, 2, 0]) == :math.sqrt(6) | |
assert Distances.euclidean([-1, 0], [1, 0]) == :math.sqrt(4) | |
assert Distances.euclidean([0, 0, 0, 0], [1, 0, 1, 0]) == :math.sqrt(2) | |
assert Distances.euclidean([0, 0], [1, 0]) == 1 | |
end | |
end | |
describe "Distances.manhattan/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.manhattan([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.manhattan([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.manhattan([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.manhattan([], []) end | |
end | |
test "It calculates the manhattan distance for known lists" do | |
assert Distances.manhattan([0, 1, 2], [1, 2, 0]) == 4 | |
assert Distances.manhattan([-1, 0], [1, 0]) == 2 | |
assert Distances.manhattan([0, 0, 0, 0], [1, 0, 1, 0]) == 2 | |
assert Distances.manhattan([0, 0], [1, 0]) == 1 | |
end | |
end | |
describe "Distances.chebyshev/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.chebyshev([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.chebyshev([], []) end | |
end | |
test "It calculates the chebyshev distance for known lists" do | |
assert Distances.chebyshev([0, 1, 2], [1, 2, 0]) == 2 | |
assert Distances.chebyshev([-1, 0], [1, 0]) == 2 | |
assert Distances.chebyshev([0, 0, 0, 0], [1, 0, 1, 0]) == 1 | |
assert Distances.chebyshev([0, 0], [1, 0]) == 1 | |
end | |
end | |
describe "Distances.minkowski/3" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [1, 2], 1) end | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1], 2) end | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1, 2, 3], 3) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [], 1) end | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([], [1, 2], 2) end | |
assert_raise FunctionClauseError, fn -> Distances.minkowski([], [], 3) end | |
end | |
test "It calculates the manhattan distance when p is 1" do | |
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 1) == 4 | |
assert Distances.minkowski([-1, 0], [1, 0], 1) == 2 | |
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 1) == 2 | |
assert Distances.minkowski([0, 0], [1, 0], 1) == 1 | |
end | |
test "It calculates the euclidean distance when p is 2" do | |
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 2) == :math.sqrt(6) | |
assert Distances.minkowski([-1, 0], [1, 0], 2) == :math.sqrt(4) | |
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 2) == :math.sqrt(2) | |
assert Distances.minkowski([0, 0], [1, 0], 2) == 1 | |
end | |
test "It calculates the minkowski distance when p >=3" do | |
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 3) == 2.154434690031884 | |
assert Distances.minkowski([-1, 0], [1, 0], 4) == 2.0 | |
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 5) == 1.148698354997035 | |
assert Distances.minkowski([0, 0], [1, 0], 6) == 1 | |
end | |
test "It aproximates to the chebyshev distance when p is large enough" do | |
# Try this with a library like Decimal https://hexdocs.pm/decimal/Decimal.html | |
left = | |
Distances.minkowski([0, 1, 2, 3], [4, 5, 6, 7], 510) | |
|> Float.round() | |
right = | |
Distances.chebyshev([0, 1, 2, 3], [4, 5, 6, 7]) | |
assert left == right | |
end | |
end | |
describe "Distances.hamming/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.hamming([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.hamming([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.hamming([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.hamming([], []) end | |
end | |
test "It calculates the hamming distance for known lists" do | |
assert Distances.hamming([0, 1, 2], [1, 2, 0]) == 3 | |
assert Distances.hamming([-1, 0], [1, 0]) == 1 | |
assert Distances.hamming([0, 0, 0, 0], [1, 0, 1, 0]) == 2 | |
assert Distances.hamming([0, 0], [1, 0]) == 1 | |
end | |
end | |
describe "Distances.cosine_similarity/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], []) end | |
end | |
test "It calculates the cosine_similarity for known lists" do | |
assert Distances.cosine_similarity([0, 1, 2], [1, 2, 0]) == 0.3999999999999999 | |
assert Distances.cosine_similarity([-1, 0], [1, 0]) == -1.0 | |
assert Distances.cosine_similarity([1, 0, 0, 0], [1, 0, 1, 0]) == 1/:math.sqrt(2) | |
assert Distances.cosine_similarity([0, 1], [1, 0]) == 0 | |
end | |
test "It returns nil when any of the specified lists is a zero vector" do | |
refute Distances.cosine_similarity([-1, 0], [0, 0]) | |
refute Distances.cosine_similarity([0, 0], [1, 0]) | |
refute Distances.cosine_similarity([0, 0], [0, 0]) | |
end | |
end | |
describe "Distances.cosine/2" do | |
test "It raises an error when the lists do not have the same number of elements" do | |
assert_raise FunctionClauseError, fn -> Distances.cosine([1], [1, 2]) end | |
end | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.cosine([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.cosine([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.cosine([], []) end | |
end | |
test "It calculates the cosine distance for known lists" do | |
assert Distances.cosine([0, 1, 2], [1, 2, 0]) |> Float.round(1) == 0.6 | |
assert Distances.cosine([-1, 0], [1, 0]) == 2 | |
assert Distances.cosine([1, 0, 0, 0], [1, 0, 1, 0]) == 1 - (1/:math.sqrt(2)) | |
assert Distances.cosine([1, 2, -3], [3, 6, -9]) == 0 | |
end | |
end | |
describe "Distances.jaccard_similarity/2" do | |
test "It raises an error when the lists are empty" do | |
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([1], []) end | |
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], [1, 2]) end | |
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], []) end | |
end | |
test "It calculates the jaccard_similarity for known lists" do | |
assert Distances.jaccard_similarity([0, 1, 2], [1, 2, 0]) == 1 | |
assert Distances.jaccard_similarity([-1, 0], [1, 0]) |> Float.round(4) == 0.3333 | |
assert Distances.jaccard_similarity([0, 0, 0, 0], [1, 0, 1, 0]) == 0.5 | |
assert Distances.jaccard_similarity([0, 0], [1, 0]) == 0.5 | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can read a bit about my journey writing those two files here: https://estebanz.co/posts/distance-metrics-in-elixir/ it's in spanish.
All functions were gathered from this blogpost: https://www.vector-logic.com/blog/posts/common-distance-metrics-implemented-in-ruby