Skip to content

Instantly share code, notes, and snippets.

@atronah
Last active September 11, 2018 13:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atronah/e8de8ffbe22b9fabb4d65387884fae0c to your computer and use it in GitHub Desktop.
Save atronah/e8de8ffbe22b9fabb4d65387884fae0c to your computer and use it in GitHub Desktop.
MySQL function for calculating Domerau-Levenshtein distance
CREATE FUNCTION `getDamerauLevenshtein`(s1 VARCHAR(128) CHARSET utf8, s2 VARCHAR(128) CHARSET utf8) RETURNS int(11)
DETERMINISTIC
COMMENT 'Returns Domerau-Levenshtein distance, i.e. minimal number of char operations (cut, paste, replace, transposition) for two string'
BEGIN
DECLARE resultDistance INT(11);
DECLARE currRow INT; -- current row of matrix (0 .. LENGTH(s1))
DECLARE currColumn INT; -- current column of matrix (0 .. LENGTH(s2))
DECLARE s1Length INT; -- first\left\s1 string length
DECLARE s2Length INT; -- second\right\s2 string length
DECLARE replaceDistance INT; -- current distance for replace operation
DECLARE insertDistance INT; -- current distance for insert operation
DECLARE deleteDistance INT; -- current distance for delete operation
DECLARE tmpString VARCHAR(128); -- for swapping values of string
DECLARE tmpLength VARCHAR(128); -- for swapping values of string length
DECLARE currDistsList VARCHAR(128); -- previous row of matrix in string format (column values separated by comma)
DECLARE prevDistsList VARCHaR(128); -- current row of matrix (column values separated by comma)
-- for equivalent strings number of operations is zero
IF s1 LIKE s2 THEN
RETURN 0;
END IF;
SET s1Length = CHAR_LENGTH(s1);
SET s2Length = CHAR_LENGTH(s2);
-- for empty string number of operations is equal number of characters other string
IF s1Length = 0 THEN
RETURN s2Length;
END IF;
-- for empty string number of operations is equal number of characters other string
IF s2Length = 0 THEN
RETURN s1Length;
END IF;
-- optimization
IF s1Length > s2Length THEN
SET tmpString = s1;
SET s1 = s2;
SET s2 = tmpString;
SET tmpLength = s1Length;
SET s1Length = s2Length;
SET s2Length = tmpLength;
END IF;
SET currColumn = 0;
-- first row of distance value matrix contain number from 0 to length second string
WHILE (currColumn <= s2Length) DO
SET prevDistsList = CONCAT_WS(',', prevDistsList, currColumn);
SET currColumn = currColumn + 1;
END WHILE;
SET currRow = 1;
-- for reference:
-- "SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1))"
-- equivalent
-- "prevDistsList[currColumn]" in python (type(prevDistsList) == list)
-- currColumn + 1 because start index for SUBSTRING_INDEX is 1 (but 0 as in standart list).
WHILE (currRow <= s1Length) DO
SET currDistsList = concat_ws(',', currRow);
SET currColumn = 1;
WHILE (currColumn <= s2Length) DO
-- if s1[row] = s2[column] then do not increment the counter operations (D[row, column] = D[row - 1, column -1])
-- or if transposition detected
IF SUBSTRING(s1, currRow, 1) = SUBSTRING(s2, currColumn, 1)
OR (SUBSTRING(s1, currRow - 1, 1) = SUBSTRING(s2, currColumn, 1)
AND SUBSTRING(s1, currRow, 1) = SUBSTRING(s2, currColumn - 1, 1)) THEN
SET currDistsList = concat_ws(',', currDistsList, SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1) - 1), ',', -1));
ELSE
-- previous distance for "replace" operation in D[row - 1, column -1]
SET replaceDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1) - 1), ',', -1) + 1;
-- previous distance for "insert" operation in D[row, column -1]
SET insertDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(currDistsList, ',', (currColumn + 1) - 1), ',', -1) + 1;
-- previous distance for "delete" operation in D[row - 1, column]
SET deleteDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(prevDistsList, ',', (currColumn + 1)), ',', -1) + 1;
SET currDistsList = concat_ws(',', currDistsList, LEAST(replaceDistance, insertDistance, deleteDistance));
END IF;
SET currColumn = currColumn + 1;
END WHILE;
SET currRow = currRow + 1;
SET prevDistsList = currDistsList;
END WHILE;
SET resultDistance = SUBSTRING_INDEX(SUBSTRING_INDEX(currDistsList, ',', s2Length + 1), ',', -1);
RETURN resultDistance;
END
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment