--
--
-- Save muthu32/83ad3f8f11f9c6fbf31c1b1b0e18801a to your computer and use it in GitHub Desktop.
-- MySQL Levenshtein distance algorithm
-- This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
-- Learn more about bidirectional Unicode characters
-- ---------------------------------------------------------------------------
-- Levenshtein distance
-- from the Artful Common Queries page
-- http://www.artfulsoftware.com/infotree/qrytip.php?id=552
-- ---------------------------------------------------------------------------
-- The Levenshtein distance between two strings is the minimum number of
-- operations needed to transform one string into the other, where an operation
-- may be insertion, deletion or substitution of one character.
-- levenshtein(s1, s2)
--   Returns the Levenshtein (edit) distance between s1 and s2, counted in
--   characters, or NULL when either argument is NULL.
--   Characters are compared with `=`, so the result follows the collation in
--   effect for the arguments (e.g. case-insensitive under a *_ci collation).
--   Inputs are limited to 255 characters by the parameter types.
--   NOTE: when loading this through the mysql command-line client, wrap the
--   statement in a non-default DELIMITER so the inner semicolons are not
--   treated as the end of the CREATE FUNCTION statement.
CREATE FUNCTION levenshtein( s1 VARCHAR(255), s2 VARCHAR(255) )
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
    -- VARCHAR(1) instead of CHAR: a CHAR value can lose its trailing space on
    -- retrieval, which would turn a space character into an empty string.
    DECLARE s1_char VARCHAR(1);
    -- cv1 = previous DP row, cv0 = row being built. One byte per cell: with
    -- inputs capped at 255 characters every distance fits in a single byte and
    -- a row needs at most 256 cells (columns 0..255).
    DECLARE cv0, cv1 VARBINARY(256);

    -- The distance to or from an unknown value is unknown.
    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    SET s1_len = CHAR_LENGTH(s1), s2_len = CHAR_LENGTH(s2), cv1 = 0x00, j = 1, i = 1, c = 0;

    -- The length check keeps strings that differ only by trailing spaces
    -- (equal under PAD SPACE collations) from being reported as distance 0.
    IF s1_len = s2_len AND s1 = s2 THEN
        RETURN 0;
    ELSEIF s1_len = 0 THEN
        RETURN s2_len;
    ELSEIF s2_len = 0 THEN
        RETURN s1_len;
    ELSE
        -- Row 0: turning the empty prefix of s1 into the first j characters
        -- of s2 costs j insertions. cv1 already holds column 0 (0x00).
        WHILE j <= s2_len DO
            SET cv1 = CONCAT(cv1, UNHEX(HEX(j))), j = j + 1;
        END WHILE;

        WHILE i <= s1_len DO
            -- Column 0 of row i is i (delete the first i characters of s1).
            SET s1_char = SUBSTRING(s1, i, 1), c = i, cv0 = UNHEX(HEX(i)), j = 1;

            WHILE j <= s2_len DO
                -- Insertion: cell to the left (held in c) + 1.
                SET c = c + 1;

                IF s1_char = SUBSTRING(s2, j, 1) THEN
                    SET cost = 0;
                ELSE
                    SET cost = 1;
                END IF;

                -- Substitution or match: diagonal cell. Byte j of cv1 is
                -- column j-1 of the previous row.
                SET c_temp = CONV(HEX(SUBSTRING(cv1, j, 1)), 16, 10) + cost;
                IF c > c_temp THEN
                    SET c = c_temp;
                END IF;

                -- Deletion: cell directly above (column j of the previous row) + 1.
                SET c_temp = CONV(HEX(SUBSTRING(cv1, j + 1, 1)), 16, 10) + 1;
                IF c > c_temp THEN
                    SET c = c_temp;
                END IF;

                SET cv0 = CONCAT(cv0, UNHEX(HEX(c))), j = j + 1;
            END WHILE;

            SET cv1 = cv0, i = i + 1;
        END WHILE;
    END IF;

    -- c holds the last cell of the last row: the full edit distance.
    RETURN c;
END;
-- Helper function: Levenshtein-based similarity of two strings as a percentage.
-- levenshtein_ratio(s1, s2)
--   Returns the similarity of s1 and s2 as a rounded percentage:
--   100 * (1 - levenshtein(s1, s2) / length of the longer string).
--   Returns NULL when either argument is NULL and 100 when both are empty.
--   NOTE: when loading this through the mysql command-line client, wrap the
--   statement in a non-default DELIMITER so the inner semicolons are not
--   treated as the end of the CREATE FUNCTION statement.
CREATE FUNCTION levenshtein_ratio( s1 VARCHAR(255), s2 VARCHAR(255) )
RETURNS INT
DETERMINISTIC
BEGIN
    DECLARE max_len INT;

    -- Similarity to an unknown value is unknown.
    IF s1 IS NULL OR s2 IS NULL THEN
        RETURN NULL;
    END IF;

    -- CHAR_LENGTH (characters), not LENGTH (bytes): the distance itself is
    -- counted in characters, so the denominator must use the same unit or
    -- multi-byte text gets an inflated ratio.
    SET max_len = GREATEST(CHAR_LENGTH(s1), CHAR_LENGTH(s2));

    -- Two empty strings are identical; avoid dividing by zero.
    IF max_len = 0 THEN
        RETURN 100;
    END IF;

    RETURN ROUND((1 - LEVENSHTEIN(s1, s2) / max_len) * 100);
END;
-- Sign up for free
-- to join this conversation on GitHub.
-- Already have an account?
-- Sign in to comment