Skip to content

Instantly share code, notes, and snippets.

@estebanz01
Last active July 30, 2023 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save estebanz01/62e4b1348086e24f5fe6c686a19c53f5 to your computer and use it in GitHub Desktop.
Save estebanz01/62e4b1348086e24f5fe6c686a19c53f5 to your computer and use it in GitHub Desktop.
Some distance metrics implemented in plain Elixir, with tests. Blog in spanish: https://estebanz.co/posts/distance-metrics-in-elixir/
defmodule Distances do
def euclidean(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
# We zip both lists into one, then we reduce it to the euclidean calculation.
# Enum.zip_reduce(a, b, 0, fn) <- as a shortcut.
Stream.zip(a, b)
|> Enum.reduce(0, fn {x, y}, accumulator ->
accumulator + (x - y) ** 2
end)
|> :math.sqrt()
end
def manhattan(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
Stream.zip(a, b)
|> Enum.reduce(0, fn {x,y}, accumulator -> accumulator + abs(x - y) end)
end
def chebyshev(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
Stream.zip(a, b)
|> Stream.map(fn {x,y} -> abs(x - y) end)
|> Enum.max()
end
def minkowski(a, b, p) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
case p do
1 -> manhattan(a, b)
2 -> euclidean(a, b)
_ ->
Stream.zip(a, b)
|> Enum.reduce(0, fn {x, y}, accumulator ->
accumulator + (abs(x - y) ** p)
end) |> (fn x -> x ** (1.0/p) end).()
end
end
def hamming(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
# Only count the ones that are different, which is the same as doing a summation on the signal output.
Stream.zip(a,b) |> Enum.count(fn {x, y} -> x != y end)
end
def dot_product(a, b) do
Stream.zip(a, b)
|> Enum.reduce(0, fn {x, y}, accumulator ->
accumulator + (x * y)
end)
end
def magnitude(a) do
Enum.reduce(a, 0, fn x, accumulator -> accumulator + (x ** 2) end)
|> :math.sqrt()
end
def cosine_similarity(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
all_zeros = (fn x -> Enum.all?(x, fn y -> y == 0 end) end)
unless all_zeros.(a) or all_zeros.(b) do
dot_product(a, b) / (magnitude(a) * magnitude(b))
end
end
def jaccard_similarity(a, b) when length(a) > 0 and length(b) > 0 do
set_a = MapSet.new(a)
set_b = MapSet.new(b)
intersection = MapSet.intersection(set_a, set_b)
union = MapSet.union(set_a, set_b)
MapSet.size(intersection) / MapSet.size(union)
end
# We don't need to pattern match the function signature here because it's already done
# on the cosine similarity calculation
def cosine(a, b), do: 1 - cosine_similarity(a, b)
def jaccard(a, b), do: 1 - jaccard_similarity(a, b)
end
ExUnit.start()
defmodule AssertionTest do
use ExUnit.Case, async: true
describe "Distances.euclidean/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.euclidean([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.euclidean([1], []) end
assert_raise FunctionClauseError, fn -> Distances.euclidean([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.euclidean([], []) end
end
test "It calculates the euclidean distance for known lists" do
assert Distances.euclidean([0, 1, 2], [1, 2, 0]) == :math.sqrt(6)
assert Distances.euclidean([-1, 0], [1, 0]) == :math.sqrt(4)
assert Distances.euclidean([0, 0, 0, 0], [1, 0, 1, 0]) == :math.sqrt(2)
assert Distances.euclidean([0, 0], [1, 0]) == 1
end
end
describe "Distances.manhattan/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.manhattan([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.manhattan([1], []) end
assert_raise FunctionClauseError, fn -> Distances.manhattan([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.manhattan([], []) end
end
test "It calculates the manhattan distance for known lists" do
assert Distances.manhattan([0, 1, 2], [1, 2, 0]) == 4
assert Distances.manhattan([-1, 0], [1, 0]) == 2
assert Distances.manhattan([0, 0, 0, 0], [1, 0, 1, 0]) == 2
assert Distances.manhattan([0, 0], [1, 0]) == 1
end
end
describe "Distances.chebyshev/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], []) end
assert_raise FunctionClauseError, fn -> Distances.chebyshev([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.chebyshev([], []) end
end
test "It calculates the chebyshev distance for known lists" do
assert Distances.chebyshev([0, 1, 2], [1, 2, 0]) == 2
assert Distances.chebyshev([-1, 0], [1, 0]) == 2
assert Distances.chebyshev([0, 0, 0, 0], [1, 0, 1, 0]) == 1
assert Distances.chebyshev([0, 0], [1, 0]) == 1
end
end
describe "Distances.minkowski/3" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [1, 2], 1) end
assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1], 2) end
assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1, 2, 3], 3) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [], 1) end
assert_raise FunctionClauseError, fn -> Distances.minkowski([], [1, 2], 2) end
assert_raise FunctionClauseError, fn -> Distances.minkowski([], [], 3) end
end
test "It calculates the manhattan distance when p is 1" do
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 1) == 4
assert Distances.minkowski([-1, 0], [1, 0], 1) == 2
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 1) == 2
assert Distances.minkowski([0, 0], [1, 0], 1) == 1
end
test "It calculates the euclidean distance when p is 2" do
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 2) == :math.sqrt(6)
assert Distances.minkowski([-1, 0], [1, 0], 2) == :math.sqrt(4)
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 2) == :math.sqrt(2)
assert Distances.minkowski([0, 0], [1, 0], 2) == 1
end
test "It calculates the minkowski distance when p >=3" do
assert Distances.minkowski([0, 1, 2], [1, 2, 0], 3) == 2.154434690031884
assert Distances.minkowski([-1, 0], [1, 0], 4) == 2.0
assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 5) == 1.148698354997035
assert Distances.minkowski([0, 0], [1, 0], 6) == 1
end
test "It aproximates to the chebyshev distance when p is large enough" do
# Try this with a library like Decimal https://hexdocs.pm/decimal/Decimal.html
left =
Distances.minkowski([0, 1, 2, 3], [4, 5, 6, 7], 510)
|> Float.round()
right =
Distances.chebyshev([0, 1, 2, 3], [4, 5, 6, 7])
assert left == right
end
end
describe "Distances.hamming/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.hamming([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.hamming([1], []) end
assert_raise FunctionClauseError, fn -> Distances.hamming([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.hamming([], []) end
end
test "It calculates the hamming distance for known lists" do
assert Distances.hamming([0, 1, 2], [1, 2, 0]) == 3
assert Distances.hamming([-1, 0], [1, 0]) == 1
assert Distances.hamming([0, 0, 0, 0], [1, 0, 1, 0]) == 2
assert Distances.hamming([0, 0], [1, 0]) == 1
end
end
describe "Distances.cosine_similarity/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], []) end
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], []) end
end
test "It calculates the cosine_similarity for known lists" do
assert Distances.cosine_similarity([0, 1, 2], [1, 2, 0]) == 0.3999999999999999
assert Distances.cosine_similarity([-1, 0], [1, 0]) == -1.0
assert Distances.cosine_similarity([1, 0, 0, 0], [1, 0, 1, 0]) == 1/:math.sqrt(2)
assert Distances.cosine_similarity([0, 1], [1, 0]) == 0
end
test "It returns nil when any of the specified lists is a zero vector" do
refute Distances.cosine_similarity([-1, 0], [0, 0])
refute Distances.cosine_similarity([0, 0], [1, 0])
refute Distances.cosine_similarity([0, 0], [0, 0])
end
end
describe "Distances.cosine/2" do
test "It raises an error when the lists do not have the same number of elements" do
assert_raise FunctionClauseError, fn -> Distances.cosine([1], [1, 2]) end
end
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.cosine([1], []) end
assert_raise FunctionClauseError, fn -> Distances.cosine([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.cosine([], []) end
end
test "It calculates the cosine distance for known lists" do
assert Distances.cosine([0, 1, 2], [1, 2, 0]) |> Float.round(1) == 0.6
assert Distances.cosine([-1, 0], [1, 0]) == 2
assert Distances.cosine([1, 0, 0, 0], [1, 0, 1, 0]) == 1 - (1/:math.sqrt(2))
assert Distances.cosine([1, 2, -3], [3, 6, -9]) == 0
end
end
describe "Distances.jaccard_similarity/2" do
test "It raises an error when the lists are empty" do
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([1], []) end
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], [1, 2]) end
assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], []) end
end
test "It calculates the jaccard_similarity for known lists" do
assert Distances.jaccard_similarity([0, 1, 2], [1, 2, 0]) == 1
assert Distances.jaccard_similarity([-1, 0], [1, 0]) |> Float.round(4) == 0.3333
assert Distances.jaccard_similarity([0, 0, 0, 0], [1, 0, 1, 0]) == 0.5
assert Distances.jaccard_similarity([0, 0], [1, 0]) == 0.5
end
end
end
@estebanz01
Copy link
Author

You can read a bit about my journey writing those two files here: https://estebanz.co/posts/distance-metrics-in-elixir/ it's in spanish.
All functions were gathered from this blogpost: https://www.vector-logic.com/blog/posts/common-distance-metrics-implemented-in-ruby

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment