Skip to content

Instantly share code, notes, and snippets.

@neesenk
Created December 25, 2011 14:30
Show Gist options
  • Save neesenk/1519359 to your computer and use it in GitHub Desktop.
Save neesenk/1519359 to your computer and use it in GitHub Desktop.
比较两个字符串的相似程度Jaro–Winkler距离
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
/**
* Jaro–Winkler distance
* http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
*
* 比较两个字符串的相似程度
*/
#define MAX(a, b) ((a) < (b) ? (b) : (a))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define GET(a, i) (a[i / 32] & (1UL << (i % 32)))
#define SET(a, i) (a[i / 32] |= (1UL << (i % 32)))
double jarowinkler_distance(const char *s1, const char *s2)
{
int l1 = strlen(s1), l2 = strlen(s2), p = 0, m = 0, t = 0, i = 0, j = 0, r;
uint32_t bits1[32] = { 0 }, bits2[32] = { 0 };
double ret = 0;
l1 = MIN(1024, l1), l2 = MIN(1024, l2); /* 如果字符串超出了1024, 只比较前1024个字符 */
r = MAX(0, MAX(l1, l2) / 2 - 1); /* 比较的范围 */
while (p < l1 && p < l2 && s1[p] == s2[p]) /* 前缀的长度 */
p++;
for (m = p, i = p; i < l1; i++) { /* 查找相同的字符个数 */
for (j = MAX(p, i - r); j < l2 && j <= i + r; j++) {
if (!GET(bits2, j) && s1[i] == s2[j]) {
SET(bits1, i), SET(bits2, j);
m++;
break;
}
}
}
for (i = p, j = p; i < l1 && j < l2; i++) { /* 查找相同字符错位的个数 */
if (!GET(bits1, i))
continue;
while (j < l2 && !GET(bits2, j))
j++;
if (j < l2 && s1[i] != s2[j])
t++;
j++;
}
if (m > 0) {
ret = ((double)m / l1 + (double)m / l2 + (double)(m - t / 2) / m) / 3;
ret += p * (1 - ret) / (double)MAX(l1, l2);
}
return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment