estebanz01/distances.ex

## distances.ex
defmodule Distances do
  def euclidean(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    # We zip both lists into one, then we reduce it to the euclidean calculation.
    # Enum.zip_reduce(a, b, 0, fn) <- as a shortcut.
    Stream.zip(a, b)
    |> Enum.reduce(0, fn {x, y}, accumulator ->
      accumulator + (x - y) ** 2
    end)
    |> :math.sqrt()
  end

  def manhattan(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    Stream.zip(a, b)
    |> Enum.reduce(0, fn {x,y}, accumulator -> accumulator + abs(x - y) end)
  end

  def chebyshev(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    Stream.zip(a, b)
    |> Stream.map(fn {x,y} -> abs(x - y) end)
    |> Enum.max()
  end

  def minkowski(a, b, p) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    case p do
      1 -> manhattan(a, b)
      2 -> euclidean(a, b)
      _ ->
        Stream.zip(a, b)
        |> Enum.reduce(0, fn {x, y}, accumulator ->
            accumulator + (abs(x - y) ** p)
        end) |> (fn x -> x ** (1.0/p) end).()
    end
  end

  def hamming(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    # Only count the ones that are different, which is the same as doing a summation on the signal output.
    Stream.zip(a,b) |> Enum.count(fn {x, y} -> x != y end)
  end

  def dot_product(a, b) do
    Stream.zip(a, b)
    |> Enum.reduce(0, fn {x, y}, accumulator ->
        accumulator + (x * y)
    end)
  end

  def magnitude(a) do
    Enum.reduce(a, 0, fn x, accumulator -> accumulator + (x ** 2) end)
    |> :math.sqrt()
  end

  def cosine_similarity(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
    all_zeros = (fn x -> Enum.all?(x, fn y -> y == 0 end) end)

    unless all_zeros.(a) or all_zeros.(b) do
      dot_product(a, b) / (magnitude(a) * magnitude(b))
    end
  end

  def jaccard_similarity(a, b) when length(a) > 0 and length(b) > 0 do
    set_a = MapSet.new(a)
    set_b = MapSet.new(b)
    intersection = MapSet.intersection(set_a, set_b)
    union = MapSet.union(set_a, set_b)

    MapSet.size(intersection) / MapSet.size(union)
  end

  # We don't need to pattern match the function signature here because it's already done
  # on the cosine similarity calculation
  def cosine(a, b), do: 1 - cosine_similarity(a, b)

  def jaccard(a, b), do: 1 - jaccard_similarity(a, b)
end

## distances_test.exs
ExUnit.start()

defmodule AssertionTest do
  use ExUnit.Case, async: true

  describe "Distances.euclidean/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.euclidean([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.euclidean([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.euclidean([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.euclidean([], []) end
    end

    test "It calculates the euclidean distance for known lists" do
      assert Distances.euclidean([0, 1, 2], [1, 2, 0]) == :math.sqrt(6)
      assert Distances.euclidean([-1, 0], [1, 0]) == :math.sqrt(4)
      assert Distances.euclidean([0, 0, 0, 0], [1, 0, 1, 0]) == :math.sqrt(2)
      assert Distances.euclidean([0, 0], [1, 0]) == 1
    end
  end

  describe "Distances.manhattan/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.manhattan([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.manhattan([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.manhattan([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.manhattan([], []) end
    end

    test "It calculates the manhattan distance for known lists" do
      assert Distances.manhattan([0, 1, 2], [1, 2, 0]) == 4
      assert Distances.manhattan([-1, 0], [1, 0]) == 2
      assert Distances.manhattan([0, 0, 0, 0], [1, 0, 1, 0]) == 2
      assert Distances.manhattan([0, 0], [1, 0]) == 1
    end
  end

  describe "Distances.chebyshev/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.chebyshev([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.chebyshev([], []) end
    end

    test "It calculates the chebyshev distance for known lists" do
      assert Distances.chebyshev([0, 1, 2], [1, 2, 0]) == 2
      assert Distances.chebyshev([-1, 0], [1, 0]) == 2
      assert Distances.chebyshev([0, 0, 0, 0], [1, 0, 1, 0]) == 1
      assert Distances.chebyshev([0, 0], [1, 0]) == 1
    end
  end

  describe "Distances.minkowski/3" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [1, 2], 1) end
      assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1], 2) end
      assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1, 2, 3], 3) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [], 1) end
      assert_raise FunctionClauseError, fn -> Distances.minkowski([], [1, 2], 2) end
      assert_raise FunctionClauseError, fn -> Distances.minkowski([], [], 3) end
    end

    test "It calculates the manhattan distance when p is 1" do
      assert Distances.minkowski([0, 1, 2], [1, 2, 0], 1) == 4
      assert Distances.minkowski([-1, 0], [1, 0], 1) == 2
      assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 1) == 2
      assert Distances.minkowski([0, 0], [1, 0], 1) == 1
    end

    test "It calculates the euclidean distance when p is 2" do
      assert Distances.minkowski([0, 1, 2], [1, 2, 0], 2) == :math.sqrt(6)
      assert Distances.minkowski([-1, 0], [1, 0], 2) == :math.sqrt(4)
      assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 2) == :math.sqrt(2)
      assert Distances.minkowski([0, 0], [1, 0], 2) == 1
    end

    test "It calculates the minkowski distance when p >=3" do
      assert Distances.minkowski([0, 1, 2], [1, 2, 0], 3) == 2.154434690031884
      assert Distances.minkowski([-1, 0], [1, 0], 4) == 2.0
      assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 5) == 1.148698354997035
      assert Distances.minkowski([0, 0], [1, 0], 6) == 1
    end

    test "It aproximates to the chebyshev distance when p is large enough" do
      # Try this with a library like Decimal https://hexdocs.pm/decimal/Decimal.html
      left =
        Distances.minkowski([0, 1, 2, 3], [4, 5, 6, 7], 510)
        |> Float.round()

      right =
        Distances.chebyshev([0, 1, 2, 3], [4, 5, 6, 7])

      assert left == right
    end
  end

  describe "Distances.hamming/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.hamming([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.hamming([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.hamming([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.hamming([], []) end
    end

    test "It calculates the hamming distance for known lists" do
      assert Distances.hamming([0, 1, 2], [1, 2, 0]) == 3
      assert Distances.hamming([-1, 0], [1, 0]) == 1
      assert Distances.hamming([0, 0, 0, 0], [1, 0, 1, 0]) == 2
      assert Distances.hamming([0, 0], [1, 0]) == 1
    end
  end

  describe "Distances.cosine_similarity/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], []) end
    end

    test "It calculates the cosine_similarity for known lists" do
      assert Distances.cosine_similarity([0, 1, 2], [1, 2, 0]) == 0.3999999999999999
      assert Distances.cosine_similarity([-1, 0], [1, 0]) == -1.0
      assert Distances.cosine_similarity([1, 0, 0, 0], [1, 0, 1, 0]) == 1/:math.sqrt(2)
      assert Distances.cosine_similarity([0, 1], [1, 0]) == 0
    end

    test "It returns nil when any of the specified lists is a zero vector" do
      refute Distances.cosine_similarity([-1, 0], [0, 0])
      refute Distances.cosine_similarity([0, 0], [1, 0])
      refute Distances.cosine_similarity([0, 0], [0, 0])
    end
  end

  describe "Distances.cosine/2" do
    test "It raises an error when the lists do not have the same number of elements" do
      assert_raise FunctionClauseError, fn -> Distances.cosine([1], [1, 2]) end
    end

    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.cosine([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.cosine([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.cosine([], []) end
    end

    test "It calculates the cosine distance for known lists" do
      assert Distances.cosine([0, 1, 2], [1, 2, 0]) |> Float.round(1) == 0.6
      assert Distances.cosine([-1, 0], [1, 0]) == 2
      assert Distances.cosine([1, 0, 0, 0], [1, 0, 1, 0]) == 1 - (1/:math.sqrt(2))
      assert Distances.cosine([1, 2, -3], [3, 6, -9]) == 0
    end
  end

  describe "Distances.jaccard_similarity/2" do
    test "It raises an error when the lists are empty" do
      assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([1], []) end
      assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], [1, 2]) end
      assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], []) end
    end

    test "It calculates the jaccard_similarity for known lists" do
      assert Distances.jaccard_similarity([0, 1, 2], [1, 2, 0]) == 1
      assert Distances.jaccard_similarity([-1, 0], [1, 0]) |> Float.round(4) == 0.3333
      assert Distances.jaccard_similarity([0, 0, 0, 0], [1, 0, 1, 0]) == 0.5
      assert Distances.jaccard_similarity([0, 0], [1, 0]) == 0.5
    end
  end
end
	defmodule Distances do
	def euclidean(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	# We zip both lists into one, then we reduce it to the euclidean calculation.
	# Enum.zip_reduce(a, b, 0, fn) <- as a shortcut.
	Stream.zip(a, b)
	\|> Enum.reduce(0, fn {x, y}, accumulator ->
	accumulator + (x - y) ** 2
	end)
	\|> :math.sqrt()
	end

	def manhattan(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	Stream.zip(a, b)
	\|> Enum.reduce(0, fn {x,y}, accumulator -> accumulator + abs(x - y) end)
	end

	def chebyshev(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	Stream.zip(a, b)
	\|> Stream.map(fn {x,y} -> abs(x - y) end)
	\|> Enum.max()
	end

	def minkowski(a, b, p) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	case p do
	1 -> manhattan(a, b)
	2 -> euclidean(a, b)
	_ ->
	Stream.zip(a, b)
	\|> Enum.reduce(0, fn {x, y}, accumulator ->
	accumulator + (abs(x - y) ** p)
	end) \|> (fn x -> x ** (1.0/p) end).()
	end
	end

	def hamming(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	# Only count the ones that are different, which is the same as doing a summation on the signal output.
	Stream.zip(a,b) \|> Enum.count(fn {x, y} -> x != y end)
	end

	def dot_product(a, b) do
	Stream.zip(a, b)
	\|> Enum.reduce(0, fn {x, y}, accumulator ->
	accumulator + (x * y)
	end)
	end

	def magnitude(a) do
	Enum.reduce(a, 0, fn x, accumulator -> accumulator + (x ** 2) end)
	\|> :math.sqrt()
	end

	def cosine_similarity(a, b) when length(a) > 0 and length(b) > 0 and length(a) == length(b) do
	all_zeros = (fn x -> Enum.all?(x, fn y -> y == 0 end) end)

	unless all_zeros.(a) or all_zeros.(b) do
	dot_product(a, b) / (magnitude(a) * magnitude(b))
	end
	end

	def jaccard_similarity(a, b) when length(a) > 0 and length(b) > 0 do
	set_a = MapSet.new(a)
	set_b = MapSet.new(b)
	intersection = MapSet.intersection(set_a, set_b)
	union = MapSet.union(set_a, set_b)

	MapSet.size(intersection) / MapSet.size(union)
	end

	# We don't need to pattern match the function signature here because it's already done
	# on the cosine similarity calculation
	def cosine(a, b), do: 1 - cosine_similarity(a, b)

	def jaccard(a, b), do: 1 - jaccard_similarity(a, b)
	end
	ExUnit.start()

	defmodule AssertionTest do
	use ExUnit.Case, async: true

	describe "Distances.euclidean/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.euclidean([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.euclidean([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.euclidean([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.euclidean([], []) end
	end

	test "It calculates the euclidean distance for known lists" do
	assert Distances.euclidean([0, 1, 2], [1, 2, 0]) == :math.sqrt(6)
	assert Distances.euclidean([-1, 0], [1, 0]) == :math.sqrt(4)
	assert Distances.euclidean([0, 0, 0, 0], [1, 0, 1, 0]) == :math.sqrt(2)
	assert Distances.euclidean([0, 0], [1, 0]) == 1
	end
	end

	describe "Distances.manhattan/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.manhattan([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.manhattan([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.manhattan([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.manhattan([], []) end
	end

	test "It calculates the manhattan distance for known lists" do
	assert Distances.manhattan([0, 1, 2], [1, 2, 0]) == 4
	assert Distances.manhattan([-1, 0], [1, 0]) == 2
	assert Distances.manhattan([0, 0, 0, 0], [1, 0, 1, 0]) == 2
	assert Distances.manhattan([0, 0], [1, 0]) == 1
	end
	end

	describe "Distances.chebyshev/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.chebyshev([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.chebyshev([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.chebyshev([], []) end
	end

	test "It calculates the chebyshev distance for known lists" do
	assert Distances.chebyshev([0, 1, 2], [1, 2, 0]) == 2
	assert Distances.chebyshev([-1, 0], [1, 0]) == 2
	assert Distances.chebyshev([0, 0, 0, 0], [1, 0, 1, 0]) == 1
	assert Distances.chebyshev([0, 0], [1, 0]) == 1
	end
	end

	describe "Distances.minkowski/3" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [1, 2], 1) end
	assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1], 2) end
	assert_raise FunctionClauseError, fn -> Distances.minkowski([1, 2], [1, 2, 3], 3) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.minkowski([1], [], 1) end
	assert_raise FunctionClauseError, fn -> Distances.minkowski([], [1, 2], 2) end
	assert_raise FunctionClauseError, fn -> Distances.minkowski([], [], 3) end
	end

	test "It calculates the manhattan distance when p is 1" do
	assert Distances.minkowski([0, 1, 2], [1, 2, 0], 1) == 4
	assert Distances.minkowski([-1, 0], [1, 0], 1) == 2
	assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 1) == 2
	assert Distances.minkowski([0, 0], [1, 0], 1) == 1
	end

	test "It calculates the euclidean distance when p is 2" do
	assert Distances.minkowski([0, 1, 2], [1, 2, 0], 2) == :math.sqrt(6)
	assert Distances.minkowski([-1, 0], [1, 0], 2) == :math.sqrt(4)
	assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 2) == :math.sqrt(2)
	assert Distances.minkowski([0, 0], [1, 0], 2) == 1
	end

	test "It calculates the minkowski distance when p >=3" do
	assert Distances.minkowski([0, 1, 2], [1, 2, 0], 3) == 2.154434690031884
	assert Distances.minkowski([-1, 0], [1, 0], 4) == 2.0
	assert Distances.minkowski([0, 0, 0, 0], [1, 0, 1, 0], 5) == 1.148698354997035
	assert Distances.minkowski([0, 0], [1, 0], 6) == 1
	end

	test "It aproximates to the chebyshev distance when p is large enough" do
	# Try this with a library like Decimal https://hexdocs.pm/decimal/Decimal.html
	left =
	Distances.minkowski([0, 1, 2, 3], [4, 5, 6, 7], 510)
	\|> Float.round()

	right =
	Distances.chebyshev([0, 1, 2, 3], [4, 5, 6, 7])

	assert left == right
	end
	end

	describe "Distances.hamming/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.hamming([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.hamming([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.hamming([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.hamming([], []) end
	end

	test "It calculates the hamming distance for known lists" do
	assert Distances.hamming([0, 1, 2], [1, 2, 0]) == 3
	assert Distances.hamming([-1, 0], [1, 0]) == 1
	assert Distances.hamming([0, 0, 0, 0], [1, 0, 1, 0]) == 2
	assert Distances.hamming([0, 0], [1, 0]) == 1
	end
	end

	describe "Distances.cosine_similarity/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.cosine_similarity([], []) end
	end

	test "It calculates the cosine_similarity for known lists" do
	assert Distances.cosine_similarity([0, 1, 2], [1, 2, 0]) == 0.3999999999999999
	assert Distances.cosine_similarity([-1, 0], [1, 0]) == -1.0
	assert Distances.cosine_similarity([1, 0, 0, 0], [1, 0, 1, 0]) == 1/:math.sqrt(2)
	assert Distances.cosine_similarity([0, 1], [1, 0]) == 0
	end

	test "It returns nil when any of the specified lists is a zero vector" do
	refute Distances.cosine_similarity([-1, 0], [0, 0])
	refute Distances.cosine_similarity([0, 0], [1, 0])
	refute Distances.cosine_similarity([0, 0], [0, 0])
	end
	end

	describe "Distances.cosine/2" do
	test "It raises an error when the lists do not have the same number of elements" do
	assert_raise FunctionClauseError, fn -> Distances.cosine([1], [1, 2]) end
	end

	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.cosine([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.cosine([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.cosine([], []) end
	end

	test "It calculates the cosine distance for known lists" do
	assert Distances.cosine([0, 1, 2], [1, 2, 0]) \|> Float.round(1) == 0.6
	assert Distances.cosine([-1, 0], [1, 0]) == 2
	assert Distances.cosine([1, 0, 0, 0], [1, 0, 1, 0]) == 1 - (1/:math.sqrt(2))
	assert Distances.cosine([1, 2, -3], [3, 6, -9]) == 0
	end
	end

	describe "Distances.jaccard_similarity/2" do
	test "It raises an error when the lists are empty" do
	assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([1], []) end
	assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], [1, 2]) end
	assert_raise FunctionClauseError, fn -> Distances.jaccard_similarity([], []) end
	end

	test "It calculates the jaccard_similarity for known lists" do
	assert Distances.jaccard_similarity([0, 1, 2], [1, 2, 0]) == 1
	assert Distances.jaccard_similarity([-1, 0], [1, 0]) \|> Float.round(4) == 0.3333
	assert Distances.jaccard_similarity([0, 0, 0, 0], [1, 0, 1, 0]) == 0.5
	assert Distances.jaccard_similarity([0, 0], [1, 0]) == 0.5
	end
	end
	end