Skip to content

Instantly share code, notes, and snippets.

@sahib
Created May 6, 2012 12:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sahib/2622023 to your computer and use it in GitHub Desktop.
Save sahib/2622023 to your computer and use it in GitHub Desktop.
Better implementation to get the lv-distance of two utf8-strings
#include <glib.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <locale.h>
///////////////////////////////
// Old version
gsize levenshtein_plain_strcmp(const gchar * s, const gchar * t)
{
int n = (s) ? strlen(s)+1 : 0;
int m = (t) ? strlen(t)+1 : 0;
// Nothing to compute really..
if (n==0) return m;
if (m==0) return n;
// String matrix
int d[n][m];
int i,j;
// Init first row|column to 0...n|m
for (i=0; i<n; i++) d[i][0] = i;
for (j=0; j<m; j++) d[0][j] = j;
for (i=1; i<n; i++)
{
// Current char in string s
char cats = s[i-1];
for (j=1; j<m; j++)
{
// Do -1 only once
int jm1 = j-1,
im1 = i-1;
// a = above cell, b = left cell, c = left above celli
int a = d[im1][j] + 1,
b = d[i][jm1] + 1,
c = d[im1][jm1] + (t[jm1] != cats);
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
}
}
// The result is stored in the very right down cell
return d[n-1][m-1];
}
///////////////////////////////
// Utf8 aware levenshtein
gsize levenshtein_strcmp(const gchar * s, const gchar * t)
{
int n = (s) ? g_utf8_strlen(s,-1)+1 : 0;
int m = (t) ? g_utf8_strlen(t,-1)+1 : 0;
// NOTE: Be sure to call g_utf8_validate(), might fail otherwise
// It's advisable to call g_utf8_normalize() too.
// Nothing to compute really..
if (n < 2) return m;
if (m < 2) return n;
// String matrix
int d[n][m];
int i,j;
// Init first row|column to 0...n|m
for (i=0; i<n; i++) d[i][0] = i;
for (j=0; j<m; j++) d[0][j] = j;
for (i=1; i<n; i++)
{
// Current char in string s
gunichar cats = g_utf8_get_char(g_utf8_offset_to_pointer(s,i-1));
for (j=1; j<m; j++)
{
// Do -1 only once
int jm1 = j-1,
im1 = i-1;
gunichar tats = g_utf8_get_char(g_utf8_offset_to_pointer(t,jm1));
// a = above cell, b = left cell, c = left above celli
int a = d[im1][j] + 1,
b = d[i][jm1] + 1,
c = d[im1][jm1] + (tats != cats);
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
}
}
// The result is stored in the very right down cell
return d[n-1][m-1];
}
///////////////////////////////
// A utf8 aware levenshtein that normalizes ambigious codepoints
// and validates input-data
gsize levenshtein_safe_strcmp(const gchar * s, const gchar * t)
{
gsize rc = 0;
if(g_utf8_validate(s,-1,NULL) == FALSE ||
g_utf8_validate(t,-1,NULL) == FALSE)
return rc;
gchar * s_norm = g_utf8_normalize(s,-1,G_NORMALIZE_ALL_COMPOSE);
gchar * t_norm = g_utf8_normalize(t,-1,G_NORMALIZE_ALL_COMPOSE);
rc = levenshtein_strcmp(s_norm,t_norm);
g_free(s_norm);
g_free(t_norm);
return rc;
}
///////////////////////////////
int main(int argc, char const *argv[])
{
if(argc < 3)
return EXIT_FAILURE;
setlocale(LC_ALL,"");
printf("Unicode lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2]));
printf("Safeutf lv_dist: %lu\n",levenshtein_safe_strcmp(argv[1],argv[2]));
printf("'Plain' lv_dist: %lu\n",levenshtein_plain_strcmp(argv[1],argv[2]));
return EXIT_SUCCESS;
}
@andyholmes
Copy link

Hi, can I use this under the terms of LGPL-2.1-or-later?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment