Skip to content

Instantly share code, notes, and snippets.

@mklca
Created October 8, 2020 16:52
Show Gist options
  • Save mklca/8fb5d19dc0c8aae9b9f5441127fbe7af to your computer and use it in GitHub Desktop.
Save mklca/8fb5d19dc0c8aae9b9f5441127fbe7af to your computer and use it in GitHub Desktop.
Jaro and Jaro-Winkler String Distance Metrics
double jaro_string_similarity(boost::string_ref a, boost::string_ref b)
{
using std::cbegin;
using std::cend;
using std::distance;
if(a.empty()) {
return b.empty() ? 1.0 : 0.0;
}
const std::size_t as = a.size();
const std::size_t bs = b.size();
// The bandwidth to consider for matches
const ptrdiff_t width = std::max(as, bs)/2 - 1;
// These vectors match the sizes of their corresponding strings
std::vector<bool> a_matches(as, false);
std::vector<bool> b_matches(bs, false);
std::size_t matches = 0;
// Count matches
using string_iterator = boost::string_ref::const_iterator;
for(string_iterator ap = cbegin(a); ap != cend(a); ++ap)
{
const std::ptrdiff_t a_offset = distance(cbegin(a), ap);
assert(a_offset >= 0);
assert(static_cast<std::size_t>(a_offset) < a_matches.size());
// Avoid undefined behavior by not incrementing past the end of string b
const string_iterator b_start = std::next(cbegin(b),
std::min(static_cast<std::size_t>(std::max(0l, a_offset - width)), bs));
const string_iterator b_end = std::next(cbegin(b),
std::min(static_cast<std::size_t>(a_offset + width + 1), bs));
for(string_iterator bp = b_start; bp != b_end; ++bp)
{
const std::ptrdiff_t b_offset = distance(cbegin(b), bp);
assert(static_cast<std::size_t>(b_offset) < b_matches.size());
if(!b_matches[b_offset] && *ap == *bp) {
a_matches[a_offset] = true;
b_matches[b_offset] = true;
++matches;
break;
}
}
}
if(matches == 0) {
return 0.0;
}
// Count transpositions
std::size_t transposes = 0;
using match_iterator = std::vector<bool>::const_iterator;
match_iterator bmp = cbegin(b_matches);
for(match_iterator amp = cbegin(a_matches); amp != cend(a_matches); ++amp)
{
// Advance amp until until a match is encountered
if(!*amp) {
continue;
}
const std::ptrdiff_t a_offset = distance(cbegin(a_matches), amp);
// Similarly for b
while(bmp != cend(b_matches) && !*bmp) {
++bmp;
}
const std::ptrdiff_t b_offset = distance(cbegin(b_matches), bmp);
const string_iterator a_pos = std::next(cbegin(a), a_offset);
const string_iterator b_pos = std::next(cbegin(b), b_offset);
if(*a_pos != *b_pos) {
++transposes;
}
if(bmp != cend(b_matches)) {
++bmp;
}
}
// Compute the distance
const double term_a = static_cast<double>(matches)/as;
const double term_b = static_cast<double>(matches)/bs;
const double term_t = (static_cast<double>(matches) - static_cast<double>(transposes)/2.0)/matches;
return (term_a + term_b + term_t)/3.0;
}
double jaro_winkler_string_similarity(std::size_t max_prefix, double scale, boost::string_ref a, boost::string_ref b)
{
const std::size_t as = a.size();
const std::size_t bs = b.size();
// Count up to max_prefix matches at the beginning of each string
const std::size_t prefix_bound = std::min({max_prefix, as, bs});
std::size_t prefix_matches = 0;
for(unsigned int offset = 0; offset < prefix_bound; ++offset)
{
if(a[offset] == b[offset]) {
++prefix_matches;
}
}
const double jaro_value = jaro_string_similarity(a, b);
return jaro_value + static_cast<double>(prefix_matches)*scale*(1.0 - jaro_value);
}
using namespace std;
BOOST_AUTO_TEST_CASE(jaro_both_empty_match)
{
const double jaro_value = klustrcore::jaro_string_similarity("", "");
BOOST_CHECK_EQUAL(jaro_value, 1.0);
}
BOOST_AUTO_TEST_CASE(jaro_left_empty_nomatch)
{
const double jaro_value = klustrcore::jaro_string_similarity("", "quo");
BOOST_CHECK_EQUAL(jaro_value, 0.0);
}
BOOST_AUTO_TEST_CASE(jaro_right_empty_nomatch)
{
const double jaro_value = klustrcore::jaro_string_similarity("quux", "");
BOOST_CHECK_EQUAL(jaro_value, 0.0);
}
BOOST_AUTO_TEST_CASE(jaro_sample_1, * boost::unit_test::tolerance(1e-2))
{
const double jaro_value = klustrcore::jaro_string_similarity("DWAYNE", "DUANE");
BOOST_TEST(jaro_value == 0.822);
}
BOOST_AUTO_TEST_CASE(jaro_sample_2, * boost::unit_test::tolerance(1e-2))
{
const double jaro_value = klustrcore::jaro_string_similarity("MARTHA", "MARHTA");
BOOST_TEST(jaro_value == 0.944);
}
BOOST_AUTO_TEST_CASE(jaro_sample_3, * boost::unit_test::tolerance(1e-2))
{
const double jaro_value = klustrcore::jaro_string_similarity("DIXON", "DICKSONX");
BOOST_TEST(jaro_value == 0.767);
}
BOOST_AUTO_TEST_CASE(jaro_sample_4, * boost::unit_test::tolerance(1e-2))
{
const double jaro_value = klustrcore::jaro_string_similarity("JELLYFISH", "SMELLYFISH");
BOOST_TEST(jaro_value == 0.896);
}
BOOST_AUTO_TEST_CASE(jaro_sample_5, * boost::unit_test::tolerance(1e-2))
{
const double jaro_value = klustrcore::jaro_string_similarity("CAT", "BIRD");
BOOST_TEST(jaro_value == 0);
}
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_1, * boost::unit_test::tolerance(1e-2))
{
const double jw_value = klustrcore::jaro_winkler_string_similarity("DWAYNE", "DUANE");
BOOST_TEST(jw_value == 0.858);
}
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_2, * boost::unit_test::tolerance(1e-2))
{
const double jw_value = klustrcore::jaro_winkler_string_similarity("MARTHA", "MARHTA");
BOOST_TEST(jw_value == 0.961);
}
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_3, * boost::unit_test::tolerance(1e-2))
{
const double jw_value = klustrcore::jaro_winkler_string_similarity("DIXON", "DICKSONX");
BOOST_TEST(jw_value == 0.813);
}
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_4, * boost::unit_test::tolerance(1e-2))
{
const double jw_value = klustrcore::jaro_winkler_string_similarity("JELLYFISH", "SMELLYFISH");
BOOST_TEST(jw_value == 0.906);
}
BOOST_AUTO_TEST_CASE(jaro_winkler_sample_5, * boost::unit_test::tolerance(1e-2))
{
const double jw_value = klustrcore::jaro_winkler_string_similarity("CAT", "BIRD");
BOOST_TEST(jw_value == 0);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment