Created
January 31, 2011 09:55
-
-
Save jaspervdj/803839 to your computer and use it in GitHub Desktop.
String diff using higher order functions for character distances
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/lua | |
-------------------------------------------------------------------------------- | |
-- Distance implementation (abstract) -- | |
-------------------------------------------------------------------------------- | |
-- Simple minimum function for 3 arguments | |
local function min3(a, b, c) | |
local x = a < b and a or b | |
return x < c and x or c | |
end | |
-- Distance from str1 to str2 | |
function get_distance(replace_cost, insert_cost, delete_cost, str1, str2) | |
local cache, len1, len2 = {}, string.len(str1), string.len(str2) | |
-- Initialize cache | |
local cache = {} | |
for i = 0, len1 do cache[i] = {} end | |
cache[0][0] = 0 | |
-- Initial values | |
for i = 1, len1 do | |
cache[i][0] = cache[i - 1][0] + delete_cost(string.sub(str1, i, i)) | |
end | |
for j = 1, len2 do | |
cache[0][j] = cache[0][j - 1] + insert_cost(string.sub(str2, j, j)) | |
end | |
-- Build cache | |
for i = 1, len1 do | |
for j = 1, len2 do | |
-- Find the two characters | |
local c1 = string.sub(str1, i, i) | |
local c2 = string.sub(str2, j, j) | |
cache[i][j] = min3( | |
-- Replace | |
cache[i - 1][j - 1] + replace_cost(c1, c2), | |
-- Insert | |
cache[i - 1][j] + insert_cost(c1), | |
-- Delete | |
cache[i][j - 1] + delete_cost(c2) | |
) | |
end | |
end | |
-- Take result | |
return cache[len1][len2] | |
end | |
-------------------------------------------------------------------------------- | |
-- Concrete functions, example: vowels vs. consonants -- | |
-------------------------------------------------------------------------------- | |
-- Test if a character is a vowel | |
function is_vowel(c) | |
if string.find("aeiou", c) then return true else return false end | |
end | |
-- Vowels are hard to replace by consonants, and vice versa | |
function replace_cost(c1, c2) | |
-- Special case: characters are equal: no cost | |
if c1 == c2 then return 0 end | |
if is_vowel(c1) == is_vowel(c2) then | |
return 1 | |
else | |
return 10000 | |
end | |
end | |
-- Consonants are harder to delete and insert | |
function cheap_vowels(c) | |
if is_vowel(c) then return 1 else return 3 end | |
end | |
-- Curried get_distance | |
function get_distance_vowels(str1, str2) | |
return get_distance(replace_cost, cheap_vowels, cheap_vowels, str1, str2) | |
end | |
-------------------------------------------------------------------------------- | |
-- Tests -- | |
-------------------------------------------------------------------------------- | |
function test(str1, str2) | |
print(str1 .. " -> " .. str2 .. ": " .. get_distance_vowels(str1, str2)) | |
end | |
test("jasper", "japser") | |
test("jasper", "jasr") | |
test("jasper", "josper") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment