Skip to content

Instantly share code, notes, and snippets.

@the-mikedavis
Last active November 8, 2023 19:28
Show Gist options
  • Save the-mikedavis/db39019721fab873980df3560345a93b to your computer and use it in GitHub Desktop.
Save the-mikedavis/db39019721fab873980df3560345a93b to your computer and use it in GitHub Desktop.
-module(jaro).
-export([distance/2]).
%% This is an Erlang translation of Elixir's `String.jaro_distance/2'.
%% Upstream: https://github.com/elixir-lang/elixir/blob/v1.15.7/lib/elixir/lib/string.ex#L2784-L2879.
%% Jaro Distance is relatively simple - only transpositions of grapheme
%% clusters are allowed which is much simpler than other edit distance
%% measurements like Damerau–Levenshtein (insertion, deletion, substitution
%% _and_ transposition).
%% Transpositions are pairs of characters that exist in both input strings but
%% have different positions in the strings.
-spec distance(BinA, BinB) -> Distance when
BinA :: binary(),
BinB :: binary(),
%% [+0.0, 1.0]
Distance :: float().
distance(Bin, Bin) ->
1.0;
distance(_Bin, <<"">>) ->
+0.0;
distance(<<"">>, _Bin) ->
+0.0;
distance(Bin1, Bin2) when is_binary(Bin1) andalso is_binary(Bin2) ->
{Graphemes1, Len1} = graphemes_and_length(Bin1),
{Graphemes2, Len2} = graphemes_and_length(Bin2),
case match(Graphemes1, Len1, Graphemes2, Len2) of
{0, _Trans} ->
+0.0;
{Common, Transpositions} ->
(Common / Len1 +
Common / Len2 +
(Common - Transpositions) / Common) / 3
end.
match(Graphemes1, Len1, Graphemes2, Len2) ->
%% Iterate through the shorter grapheme cluster list.
case Len1 < Len2 of
true ->
match(Graphemes1, Graphemes2, Len1 div 2 - 1);
false ->
match(Graphemes2, Graphemes1, Len2 div 2 - 1)
end.
match(Graphemes1, Graphemes2, Limit) ->
match(Graphemes1, Graphemes2, {0, Limit}, {0, 0, -1}, 0).
match([Grapheme | Rest], Graphemes, Range, State, Idx) ->
{Graphemes1, State1} = submatch(Grapheme, Graphemes, Range, State, Idx),
case Range of
{Limit, Limit} ->
match(Rest, tl(Graphemes1), Range, State1, Idx + 1);
{Pre, Limit} ->
match(Rest, Graphemes1, {Pre + 1, Limit}, State1, Idx + 1)
end;
match([], _, _, {Common, Transpositions, _}, _) ->
{Common, Transpositions}.
submatch(Grapheme, Graphemes, {Pre, _} = Range, State, Idx) ->
case detect(Grapheme, Graphemes, Range) of
undefined ->
{Graphemes, State};
{SubIdx, Graphemes1} ->
{Graphemes1, proceed(State, Idx - Pre + SubIdx)}
end.
detect(Grapheme, Graphemes, {Pre, Limit}) ->
detect(Grapheme, Graphemes, Pre + 1 + Limit, 0, []).
detect(_Grapheme, _Graphemes, 0, _Idx, _Acc) ->
undefined;
detect(_Grapheme, [], _Limit, _Idx, _Acc) ->
undefined;
detect(Grapheme, [Grapheme | Rest], _Limit, Idx, Acc) ->
{Idx, lists:reverse(Acc, ['=:=' | Rest])};
detect(Grapheme, [Other | Rest], Limit, Idx, Acc) ->
detect(Grapheme, Rest, Limit - 1, Idx + 1, [Other | Acc]).
proceed({Common, Transpositions, Former}, Current) ->
case Current < Former of
true ->
{Common + 1, Transpositions + 1, Current};
false ->
{Common + 1, Transpositions, Current}
end.
-spec graphemes_and_length(Bin) -> {Graphemes, Length} when
Bin :: binary(),
Graphemes :: [binary()],
Length :: non_neg_integer().
%% @doc Separates the grapheme clusters of the binary and counts grapheme
%% clusters.
%%
%% @private
%% Upstream: https://github.com/elixir-lang/elixir/blob/v1.15.7/lib/elixir/lib/string.ex#L2950-L2964
graphemes_and_length(Bin) ->
graphemes_and_length(Bin, [], 0).
graphemes_and_length(Bin, Acc, Length) ->
case unicode_util:gc(Bin) of
[Gc | Rest] ->
graphemes_and_length(Rest, [Gc | Acc], Length + 1);
[] ->
{lists:reverse(Acc), Length};
{error, <<Byte, Rest/bitstring>>} ->
graphemes_and_length(Rest, [<<Byte>> | Acc], Length + 1)
end.
-ifdef(TEST).
-include_lib("eunit/include/eunit.hrl").
edge_cases_test() ->
?assertEqual(1.0, jaro:distance(<<"foo">>, <<"foo">>)),
?assertEqual(+0.0, jaro:distance(<<"foo">>, <<"">>)),
?assertEqual(+0.0, jaro:distance(<<"">>, <<"foo">>)).
nothing_in_common_test() ->
?assertEqual(+0.0, jaro:distance(<<"foo">>, <<"bar">>)).
wikipedia_example_test() ->
Trunc = fun(Float, Precision) ->
Scalar = math:pow(10, Precision),
trunc(Float * Scalar) / Scalar
end,
%% See <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity>.
%% FAREMVIEL and FARMVILLE have a distance of 0.88 (truncated).
?assertEqual(
0.88,
Trunc(jaro:distance(<<"faremveil">>, <<"farmville">>), 2)).
unicode_test() ->
%% The pirate flag emoji is a combination of the black flag and skull emoji
%% graphemes joined by the zero-width joiner code-point. It's a single
%% grapheme <em>cluster</em> though, and we should be segmenting the input
%% binaries on grapheme clusters, so the code-points in common should not
%% be considered a transposition: they shoud not increase the distance
%% score.
%% So it should be the same score as if we replaced the pirate flag with
%% some other single grapheme cluster and the black flag and skull with
%% two different grapheme clusters.
?assertEqual(
jaro:distance(<<"🏴‍☠️"/utf8>>, <<"🏴☠️"/utf8>>),
jaro:distance(<<"a">>, <<"bc">>)).
invalid_unicode_test() ->
%% Without the `/utf8' bit string modifier, the following is not valid
%% unicode and `unicode_util:gc/1' will correctly recognize the invalid
%% byte sequences.
%% The invalid byte will be the same between both code-point lists. The
%% skull emoji will be mistaken as two grapheme clusters while the black
%% flag and zero-width joiner will be considered one cluster each, so I've
%% represented this below as the flag flag as "a", zero-width joiner as
%% space, and skull as "bc".
?assertEqual(
jaro:distance(<<"🏴‍☠️">>, <<"🏴☠️">>),
jaro:distance(<<"a bc">>, <<"abc">>)).
-endif.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment