Created
January 31, 2016 17:46
-
-
Save naoa/934da65118b7bbc99178 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <groonga.h> | |
#include <time.h> | |
#include <limits.h> | |
typedef struct { | |
double score; | |
int n_subrecs; | |
int subrecs[1]; | |
} grn_rset_recinfo; | |
void | |
output_with_sort_by_score(grn_ctx *ctx, grn_obj *hash, grn_obj *table, | |
const char *search_key, int len, const char *path) | |
{ | |
uint32_t nkeys; | |
grn_table_sort_key *keys; | |
grn_obj *sorted; | |
FILE *fp; | |
if ((fp = fopen(path, "a")) == NULL) { | |
printf("file open error!!\n"); | |
return; | |
} | |
sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, hash); | |
if ((keys = grn_table_sort_key_from_str(ctx, "_score", strlen("_score"), hash, &nkeys))) { | |
grn_table_sort(ctx, hash, 0, -1, sorted, keys, nkeys); | |
grn_table_cursor *tc; | |
if ((tc = grn_table_cursor_open(ctx, sorted, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) { | |
grn_id id; | |
grn_obj *column = grn_obj_column(ctx, sorted, | |
GRN_COLUMN_NAME_KEY, | |
GRN_COLUMN_NAME_KEY_LEN); | |
fprintf(fp, "%.*s\t%d\t", len, search_key, grn_table_size(ctx, hash)); | |
while ((id = grn_table_cursor_next(ctx, tc))) { | |
grn_id *sid, oid; | |
grn_rset_recinfo ri; | |
char key[GRN_TABLE_MAX_KEY_SIZE]; | |
unsigned int key_size; | |
grn_table_cursor_get_value(ctx, tc, (void **)&sid); | |
grn_hash_get_value(ctx, (grn_hash *)hash, *sid, (void *)&ri); | |
grn_hash_get_key(ctx, (grn_hash *)hash, *sid, (void *)&oid, sizeof(grn_id)); | |
key_size = grn_table_get_key(ctx, table, oid, key, GRN_TABLE_MAX_KEY_SIZE); | |
//fprintf(fp, "%d:", (int)ri.score); | |
//fprintf(fp, "%.*s ", key_size, key); | |
} | |
fprintf(fp, "\n"); | |
grn_obj_unlink(ctx, column); | |
grn_table_cursor_close(ctx, tc); | |
} | |
grn_table_sort_key_close(ctx, keys, nkeys); | |
} | |
fclose(fp); | |
} | |
#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)]) | |
uint32_t | |
calc_edit_distance(grn_ctx *ctx, char *x_start, char *x_end, | |
char *y_start, char *y_end, grn_bool with_transposition) | |
{ | |
uint32_t cx, lx, cy, ly; | |
uint32_t *dists; | |
uint32_t d = 0; | |
char *px, *sx = x_start, *ex = x_end; | |
char *py, *sy = y_start, *ey = y_end; | |
for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++); | |
for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++); | |
if ((dists = malloc((lx + 1) * (ly + 1) * sizeof(uint32_t)))) { | |
uint32_t x, y; | |
for (x = 0; x <= lx; x++) { DIST(x, 0) = x; } | |
for (y = 0; y <= ly; y++) { DIST(0, y) = y; } | |
for (x = 1, px = sx; x <= lx; x++, px += cx) { | |
cx = grn_charlen(ctx, px, ex); | |
for (y = 1, py = sy; y <= ly; y++, py += cy) { | |
cy = grn_charlen(ctx, py, ey); | |
if (cx == cy && !memcmp(px, py, cx)) { | |
DIST(x, y) = DIST(x - 1, y - 1); | |
} else { | |
uint32_t a, b, c; | |
a = DIST(x - 1, y) + 1; /* deletion */ | |
b = DIST(x, y - 1) + 1; /* insertion */ | |
c = DIST(x - 1, y - 1) + 1; /* substitution */ | |
DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c)); | |
if (with_transposition && | |
x > 1 && y > 1 && cx == cy | |
&& memcmp(px, py - cy, cx) == 0 | |
&& memcmp(px - cx, py, cx) == 0) { | |
uint32_t t = DIST(x - 2, y - 2) + 1; /* transposition */ | |
DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t); | |
} | |
} | |
} | |
} | |
d = DIST(lx, ly); | |
free(dists); | |
} | |
return d; | |
} | |
void | |
table_edit_distance(grn_ctx *ctx, grn_obj *table, char *search_key, int search_key_size, int max_distance, grn_bool with_transposition, grn_hash *hash) | |
{ | |
grn_table_cursor *tc; | |
char *x = search_key; | |
char *xe = x + search_key_size; | |
grn_id id; | |
if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) { | |
while ((id = grn_table_cursor_next(ctx, tc))) { | |
void *key; | |
unsigned int key_size, distance; | |
key_size = grn_table_cursor_get_key(ctx, tc, &key); | |
distance = calc_edit_distance(ctx, x, xe, (char *)key, (char *)key + key_size, with_transposition); | |
if (distance <= max_distance) { | |
grn_rset_recinfo *ri; | |
if (grn_hash_add(ctx, (grn_hash *)hash, &id, sizeof(grn_id), (void **)&ri, NULL)) { | |
ri->score = distance; | |
} | |
} | |
} | |
grn_table_cursor_close(ctx, tc); | |
} | |
} | |
int | |
main(int argc, char **argv) | |
{ | |
grn_ctx ctx; | |
grn_obj *db, *table, *words; | |
grn_id id; | |
grn_obj *result; | |
const char *path = "pat.grn"; | |
FILE *fp; | |
char s[GRN_TABLE_MAX_KEY_SIZE]; | |
clock_t start, end; | |
int max_distance; | |
int min_size; | |
grn_bool with_transposition = GRN_FALSE; | |
if (argc < 6) { | |
printf("input_error: file1(compared words) file2(search words) max_distance min_size use_transpositiong(0/1)\n"); | |
return -1; | |
} | |
max_distance = atoi(argv[3]); | |
min_size = atoi(argv[4]); | |
with_transposition = atoi(argv[5]); | |
grn_init(); | |
grn_ctx_init(&ctx, 0); | |
db = grn_db_open(&ctx, path); | |
if (!db) { db = grn_db_create(&ctx, path, NULL); } | |
printf("---- build patricia trie ---\n"); | |
start = clock(); | |
table = grn_table_create(&ctx, NULL, 0, | |
NULL, | |
GRN_OBJ_TABLE_PAT_KEY, | |
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT), NULL); | |
if ((fp = fopen(argv[1], "r")) == NULL) { | |
printf("file open error!!\n"); | |
return -1; | |
} | |
while (fgets(s, 4096, fp) != NULL) { | |
grn_table_add(&ctx, table, s, strlen(s) - 1, NULL); | |
} | |
fclose(fp); | |
//grn_p(&ctx, table); | |
end = clock(); | |
printf("time = %f[s]\n", (double)(end-start)/CLOCKS_PER_SEC); | |
printf("---- read search words ---\n"); | |
start = clock(); | |
words = grn_table_create(&ctx, NULL, 0, | |
NULL, | |
GRN_OBJ_TABLE_PAT_KEY, | |
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT), NULL); | |
if ((fp = fopen(argv[2], "r")) == NULL) { | |
printf("file open error!!\n"); | |
return -1; | |
} | |
while (fgets(s, 4096, fp) != NULL) { | |
grn_table_add(&ctx, words, s, strlen(s) - 1, NULL); | |
} | |
fclose(fp); | |
//grn_p(&ctx, words); | |
end = clock(); | |
printf("time = %f[s]\n", (double)(end-start)/CLOCKS_PER_SEC); | |
printf("---- calc edit distance using pat fuzzy search ---\n"); | |
{ | |
void *search_key = NULL; | |
int search_key_size; | |
double sum = 0; | |
GRN_TABLE_EACH(&ctx, words, GRN_ID_NIL, GRN_ID_NIL, | |
result_id, &search_key, &search_key_size, NULL, { | |
grn_obj *hash; | |
hash = grn_table_create(&ctx, NULL, 0, NULL, GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, grn_ctx_at(&ctx, GRN_DB_UINT32), NULL); | |
//printf("%.*s\t", search_key_size, (char *)search_key); | |
start = clock(); | |
grn_pat_fuzzy_search(&ctx, (grn_pat *)table, search_key, search_key_size, min_size, max_distance, with_transposition, (grn_hash *)hash); | |
end = clock(); | |
//printf("%d\t", grn_table_size(&ctx, hash)); | |
//printf("%f\n", (double)(end-start)/CLOCKS_PER_SEC); | |
sum += (double)(end - start)/CLOCKS_PER_SEC; | |
output_with_sort_by_score(&ctx, hash, table, search_key, search_key_size, "pat_fuzzy.txt"); | |
grn_obj_unlink(&ctx, hash); | |
}); | |
printf("%s\n", argv[2]); | |
printf("max_distance\tavg\n"); | |
printf("%d\t%f\n", max_distance, sum / grn_table_size(&ctx, words)); | |
} | |
printf("---- calc edit distance each by record ---\n"); | |
{ | |
void *search_key = NULL; | |
int search_key_size; | |
double sum = 0; | |
GRN_TABLE_EACH(&ctx, words, GRN_ID_NIL, GRN_ID_NIL, | |
result_id, &search_key, &search_key_size, NULL, { | |
grn_obj *hash; | |
hash = grn_table_create(&ctx, NULL, 0, NULL, GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, grn_ctx_at(&ctx, GRN_DB_UINT32), NULL); | |
//printf("%.*s\t", search_key_size, (char *)search_key); | |
start = clock(); | |
table_edit_distance(&ctx, table, search_key, search_key_size, max_distance, with_transposition, (grn_hash *)hash); | |
end = clock(); | |
//printf("%d\t", grn_table_size(&ctx, hash)); | |
//printf("%f\n", (double)(end-start)/CLOCKS_PER_SEC); | |
sum += (double)(end - start)/CLOCKS_PER_SEC; | |
output_with_sort_by_score(&ctx, hash, table, search_key, search_key_size, "edit.txt"); | |
grn_obj_unlink(&ctx, hash); | |
}); | |
printf("%s\n", argv[2]); | |
printf("max_distance\tavg\n"); | |
printf("%d\t%f\n", max_distance, sum / grn_table_size(&ctx, words)); | |
} | |
grn_obj_unlink(&ctx, words); | |
grn_obj_unlink(&ctx, table); | |
grn_obj_close(&ctx, db); | |
grn_ctx_fin(&ctx); | |
grn_fin(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment