Created
July 4, 2022 10:56
-
-
Save sfaut/5cf74b3fe6f17d7dea72143b7c35fc0e to your computer and use it in GitHub Desktop.
SQL function calculating the Levenshtein distance between 2 strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Author : sfaut <https://github.com/sfaut> | |
-- Creation date : 2022-07-04 | |
-- Tested with MySQL 8.0.28 (@@sql_mode='ANSI,TRADITIONAL') | |
CREATE DEFINER="sfaut"@"%" FUNCTION "levenshtein_distance"(s1 TINYTEXT, s2 TINYTEXT) | |
RETURNS TINYINT UNSIGNED | |
DETERMINISTIC | |
COMMENT 'Calculate the Levenshtein distance' | |
BEGIN | |
DECLARE s1_length TINYINT UNSIGNED DEFAULT CHAR_LENGTH(s1); | |
DECLARE s2_length TINYINT UNSIGNED DEFAULT CHAR_LENGTH(s2); | |
DECLARE previous_distances, current_distances JSON DEFAULT JSON_ARRAY(); | |
DECLARE p1, p2 TINYINT UNSIGNED; | |
DECLARE c1 VARCHAR(1); -- Not CHAR(1) because of auto-trim white space | |
DECLARE cost TINYINT UNSIGNED; | |
-- Remarkable cases handling | |
-- OK because all costs are at 1 | |
IF s1 = s2 THEN | |
RETURN 0; | |
ELSEIF s1 = '' THEN | |
RETURN s2_length; | |
ELSEIF s2 = '' THEN | |
RETURN s1_length; | |
ELSEIF s1_length > s2_length AND POSITION(s2 IN s1) > 0 THEN | |
RETURN s1_length - s2_length; | |
ELSEIF s2_length > s1_length AND POSITION(s1 IN s2) > 0 THEN | |
RETURN s2_length - s1_length; | |
END IF; | |
SET p2 = 0; | |
WHILE p2 <= s2_length DO | |
SET previous_distances = JSON_ARRAY_APPEND(previous_distances, '$', p2); | |
SET p2 = p2 + 1; | |
END WHILE; | |
SET p1 = 1; | |
WHILE p1 <= s1_length DO | |
SET current_distances = JSON_ARRAY(p1); | |
SET c1 = SUBSTRING(s1 FROM p1 FOR 1); | |
SET p2 = 1; | |
WHILE p2 <= s2_length DO | |
SET cost = LEAST( -- Cheap cost retrieval | |
-- Removal cost | |
JSON_EXTRACT(previous_distances, CONCAT('$[', p2, ']')) + 1, | |
-- Insertion cost | |
JSON_EXTRACT(current_distances, CONCAT('$[', p2 - 1, ']')) + 1, | |
-- Substitution cost | |
JSON_EXTRACT(previous_distances, CONCAT('$[', p2 - 1, ']')) | |
+ IF(c1 = SUBSTRING(s2 FROM p2 FOR 1), 0, 1) | |
); | |
SET current_distances = JSON_ARRAY_APPEND(current_distances, '$', cost); | |
SET p2 = p2 + 1; | |
END WHILE; | |
SET previous_distances = current_distances; | |
SET p1 = p1 + 1; | |
END WHILE; | |
-- The result is the last cost | |
RETURN JSON_EXTRACT(current_distances, '$[last]'); | |
END |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment