Skip to content

Instantly share code, notes, and snippets.

@chaudum
Last active December 10, 2015 22:58
Show Gist options
  • Save chaudum/4505916 to your computer and use it in GitHub Desktop.
Save chaudum/4505916 to your computer and use it in GitHub Desktop.
The higher the Jaro–Winkler distance for two strings is, the more similar the strings are. The Jaro–Winkler distance metric is designed and best suited for short strings such as person names. The score is normalized such that 0 equates to no similarity and 1 is an exact match.
import math
## JARO-WINKLER-DISTANCE
## Ported from http://www.iugrina.com/files/JaroWinkler/JaroWinkler.phps
def get_common_chars(str1, str2, allowed_distance):
str1_len = len(str1)
str2_len = len(str2)
tmp_str2 = [str2[x] for x in xrange(len(str2))]
common_chars = []
for i in xrange(str1_len):
no_match = True
range_min = int(max(0, i-allowed_distance))
range_max = int(min(i+allowed_distance+1, str2_len))
for j in xrange(range_min, range_max):
if no_match and tmp_str2[j] == str1[i]:
no_match = False
common_chars.append(str1[i])
tmp_str2[j] = ''
res = "".join(common_chars)
return res
def jaro(str1, str2):
str1_len = len(str1)
str2_len = len(str2)
## theoretical distance
distance = math.floor(min(str1_len, str2_len) / 2.0)
## get common characters
commons1 = get_common_chars(str1, str2, distance)
commons2 = get_common_chars(str2, str1, distance)
commons1_len = len(commons1)
commons2_len = len(commons2)
if commons1_len == 0 or commons2_len == 0:
return 0
## calculate transposition
transpositions = 0
upper_bound = min(commons1_len, commons2_len)
for i in xrange(upper_bound):
if commons1[i] != commons2[i]:
transpositions += 1
transpositions /= 2.0
## return jaro distance
return (commons1_len/str1_len + commons2_len/str2_len + (commons1_len - transpositions)/commons1_len) / 3.0
def get_prefix_length(str1, str2, MIN_PREFIX_LENGTH=4):
n = min(MIN_PREFIX_LENGTH, len(str1), len(str2))
for i in xrange(n):
if str1[i] != str2[i]:
## return index of first occurrence of different characters
return i
return n
def jaro_winkler_distance(str1, str2, PREFIX_SCALE=0.1):
distance = jaro(str1, str2)
prefix_len = get_prefix_length(str1, str2)
return distance + prefix_len * PREFIX_SCALE * (1.0 - distance)
## Usage: python jaro_winkler_distance.py
if __name__ == "__main__":
str1 = raw_input('Sequence 1: ')
str2 = raw_input('Sequence 2: ')
d = jaro_winkler_distance(str1, str2)
print "Jaro-Winkler distance: %f" % d
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment