Skip to content

Instantly share code, notes, and snippets.

@Rostepher
Last active October 5, 2018 22:05
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Rostepher/818fd0ce41d0c18154ae to your computer and use it in GitHub Desktop.
Save Rostepher/818fd0ce41d0c18154ae to your computer and use it in GitHub Desktop.
A working implementation of the Damerau-Levenshtein distance in C.
// based on the implementation found at:
// https://stackoverflow.com/questions/10727174/damerau-levenshtein-distance-edit-distance-with-transposition-c-implementation
//
// still deciphering the alogrithm from the cryptic vairable names, hopefully thie implementation below
// is easier to read and understand.
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#define EQ(a, b) ((a) == (b))
#define MIN(a, b) ((a) < (b)) ? (a) : (b)
#define MIN4(a, b, c, d) MIN(MIN(a, b), MIN(c, d))
static void print_matrix(unsigned **matrix, size_t m_rows, size_t m_cols);
/**
* Calculates and returns the Damerau-Levenshtein distance of two non NULL
* strings.
*
* @param str1 first non NULL string
* @param str2 second non NULL string
* @param alpha_size size of the alphabet
*
* @returns Damerau-Levenshtein distance of str1 and str2
*/
unsigned damerau_levenshtein(const char *str1,
const char *str2,
const unsigned alpha_size)
{
// strings cannot be NULL
assert(str1 != NULL);
assert(str2 != NULL);
// calculate size of strings
size_t str1_len = strlen(str1);
size_t str2_len = strlen(str2);
// handle cases where one or both strings are empty
if (str1_len == 0)
return str2_len;
if (str2_len == 0)
return str1_len;
const unsigned INFINITY = str1_len + str2_len;
unsigned row, col;
// create "dictionary"
unsigned *dict = calloc(alpha_size, sizeof(unsigned));
size_t m_rows = str1_len + 2; // matrix rows
size_t m_cols = str2_len + 2; // matrix cols
// matrix to hold computed values
unsigned **matrix = malloc(m_rows * sizeof(unsigned *));
for (unsigned i = 0; i < m_rows; i++)
matrix[i] = calloc(m_cols, sizeof(unsigned));
print_matrix(matrix, m_rows, m_cols);
// set all the starting values and add all characters to the dict
matrix[0][0] = INFINITY;
for (row = 1; row < m_rows; row++) {
matrix[row][0] = INFINITY;
matrix[row][1] = row - 1;
}
for (col = 1; col < m_cols; col++) {
matrix[0][col] = INFINITY;
matrix[1][col] = col - 1;
}
print_matrix(matrix, m_rows, m_cols);
unsigned db; // no idea what this is
unsigned i, k; // also no idea what these are
unsigned cost;
// fill in the matrix
for (row = 1; row <= str1_len; row++) {
db = 0;
for (col = 1; col <= str2_len; col++) {
i = dict[(unsigned) str2[col - 1]];
k = db;
cost = EQ(str1[row - 1], str2[col - 1]) ? 0 : 1;
if (cost == 0)
db = col;
matrix[row + 1][col + 1] = MIN4(
matrix[row][col] + cost,
matrix[row + 1][col] + 1,
matrix[row][col + 1] + 1,
matrix[i][k] + (row - i - 1) + (col - k - 1) + 1
);
print_matrix(matrix, m_rows, m_cols);
}
dict[(unsigned) str1[row - 1]] = row;
}
unsigned result = matrix[m_rows - 1][m_cols - 1];
// free dict
free(dict);
// free matrix
for (unsigned i = 0; i < m_rows; i++)
free(matrix[i]);
free(matrix);
return result;
}
#include <stdio.h>
static void print_matrix(unsigned **matrix, size_t m_rows, size_t m_cols)
{
unsigned i, k;
for (i = 0; i < m_rows; i++) {
for (k = 0; k < m_cols; k++) {
printf("%02d ", matrix[i][k]);
}
printf("\n");
}
printf("\n");
}
#define ALPHABET_SIZE 256
int main(int argc, char **argv)
{
assert(argc > 2);
char *str1 = argv[1];
char *str2 = argv[2];
printf("DL(\"%s\", \"%s\") = %d\n", str1, str2,
damerau_levenshtein(str1, str2, ALPHABET_SIZE));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment