Skip to content

Instantly share code, notes, and snippets.

@naoa
Created January 31, 2016 17:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naoa/934da65118b7bbc99178 to your computer and use it in GitHub Desktop.
Save naoa/934da65118b7bbc99178 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <groonga.h>
#include <time.h>
#include <limits.h>
typedef struct {
double score;
int n_subrecs;
int subrecs[1];
} grn_rset_recinfo;
void
output_with_sort_by_score(grn_ctx *ctx, grn_obj *hash, grn_obj *table,
const char *search_key, int len, const char *path)
{
uint32_t nkeys;
grn_table_sort_key *keys;
grn_obj *sorted;
FILE *fp;
if ((fp = fopen(path, "a")) == NULL) {
printf("file open error!!\n");
return;
}
sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, hash);
if ((keys = grn_table_sort_key_from_str(ctx, "_score", strlen("_score"), hash, &nkeys))) {
grn_table_sort(ctx, hash, 0, -1, sorted, keys, nkeys);
grn_table_cursor *tc;
if ((tc = grn_table_cursor_open(ctx, sorted, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) {
grn_id id;
grn_obj *column = grn_obj_column(ctx, sorted,
GRN_COLUMN_NAME_KEY,
GRN_COLUMN_NAME_KEY_LEN);
fprintf(fp, "%.*s\t%d\t", len, search_key, grn_table_size(ctx, hash));
while ((id = grn_table_cursor_next(ctx, tc))) {
grn_id *sid, oid;
grn_rset_recinfo ri;
char key[GRN_TABLE_MAX_KEY_SIZE];
unsigned int key_size;
grn_table_cursor_get_value(ctx, tc, (void **)&sid);
grn_hash_get_value(ctx, (grn_hash *)hash, *sid, (void *)&ri);
grn_hash_get_key(ctx, (grn_hash *)hash, *sid, (void *)&oid, sizeof(grn_id));
key_size = grn_table_get_key(ctx, table, oid, key, GRN_TABLE_MAX_KEY_SIZE);
//fprintf(fp, "%d:", (int)ri.score);
//fprintf(fp, "%.*s ", key_size, key);
}
fprintf(fp, "\n");
grn_obj_unlink(ctx, column);
grn_table_cursor_close(ctx, tc);
}
grn_table_sort_key_close(ctx, keys, nkeys);
}
fclose(fp);
}
#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
uint32_t
calc_edit_distance(grn_ctx *ctx, char *x_start, char *x_end,
char *y_start, char *y_end, grn_bool with_transposition)
{
uint32_t cx, lx, cy, ly;
uint32_t *dists;
uint32_t d = 0;
char *px, *sx = x_start, *ex = x_end;
char *py, *sy = y_start, *ey = y_end;
for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
if ((dists = malloc((lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
uint32_t x, y;
for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
for (x = 1, px = sx; x <= lx; x++, px += cx) {
cx = grn_charlen(ctx, px, ex);
for (y = 1, py = sy; y <= ly; y++, py += cy) {
cy = grn_charlen(ctx, py, ey);
if (cx == cy && !memcmp(px, py, cx)) {
DIST(x, y) = DIST(x - 1, y - 1);
} else {
uint32_t a, b, c;
a = DIST(x - 1, y) + 1; /* deletion */
b = DIST(x, y - 1) + 1; /* insertion */
c = DIST(x - 1, y - 1) + 1; /* substitution */
DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
if (with_transposition &&
x > 1 && y > 1 && cx == cy
&& memcmp(px, py - cy, cx) == 0
&& memcmp(px - cx, py, cx) == 0) {
uint32_t t = DIST(x - 2, y - 2) + 1; /* transposition */
DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
}
}
}
}
d = DIST(lx, ly);
free(dists);
}
return d;
}
void
table_edit_distance(grn_ctx *ctx, grn_obj *table, char *search_key, int search_key_size, int max_distance, grn_bool with_transposition, grn_hash *hash)
{
grn_table_cursor *tc;
char *x = search_key;
char *xe = x + search_key_size;
grn_id id;
if ((tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID))) {
while ((id = grn_table_cursor_next(ctx, tc))) {
void *key;
unsigned int key_size, distance;
key_size = grn_table_cursor_get_key(ctx, tc, &key);
distance = calc_edit_distance(ctx, x, xe, (char *)key, (char *)key + key_size, with_transposition);
if (distance <= max_distance) {
grn_rset_recinfo *ri;
if (grn_hash_add(ctx, (grn_hash *)hash, &id, sizeof(grn_id), (void **)&ri, NULL)) {
ri->score = distance;
}
}
}
grn_table_cursor_close(ctx, tc);
}
}
int
main(int argc, char **argv)
{
grn_ctx ctx;
grn_obj *db, *table, *words;
grn_id id;
grn_obj *result;
const char *path = "pat.grn";
FILE *fp;
char s[GRN_TABLE_MAX_KEY_SIZE];
clock_t start, end;
int max_distance;
int min_size;
grn_bool with_transposition = GRN_FALSE;
if (argc < 6) {
printf("input_error: file1(compared words) file2(search words) max_distance min_size use_transpositiong(0/1)\n");
return -1;
}
max_distance = atoi(argv[3]);
min_size = atoi(argv[4]);
with_transposition = atoi(argv[5]);
grn_init();
grn_ctx_init(&ctx, 0);
db = grn_db_open(&ctx, path);
if (!db) { db = grn_db_create(&ctx, path, NULL); }
printf("---- build patricia trie ---\n");
start = clock();
table = grn_table_create(&ctx, NULL, 0,
NULL,
GRN_OBJ_TABLE_PAT_KEY,
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT), NULL);
if ((fp = fopen(argv[1], "r")) == NULL) {
printf("file open error!!\n");
return -1;
}
while (fgets(s, 4096, fp) != NULL) {
grn_table_add(&ctx, table, s, strlen(s) - 1, NULL);
}
fclose(fp);
//grn_p(&ctx, table);
end = clock();
printf("time = %f[s]\n", (double)(end-start)/CLOCKS_PER_SEC);
printf("---- read search words ---\n");
start = clock();
words = grn_table_create(&ctx, NULL, 0,
NULL,
GRN_OBJ_TABLE_PAT_KEY,
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT), NULL);
if ((fp = fopen(argv[2], "r")) == NULL) {
printf("file open error!!\n");
return -1;
}
while (fgets(s, 4096, fp) != NULL) {
grn_table_add(&ctx, words, s, strlen(s) - 1, NULL);
}
fclose(fp);
//grn_p(&ctx, words);
end = clock();
printf("time = %f[s]\n", (double)(end-start)/CLOCKS_PER_SEC);
printf("---- calc edit distance using pat fuzzy search ---\n");
{
void *search_key = NULL;
int search_key_size;
double sum = 0;
GRN_TABLE_EACH(&ctx, words, GRN_ID_NIL, GRN_ID_NIL,
result_id, &search_key, &search_key_size, NULL, {
grn_obj *hash;
hash = grn_table_create(&ctx, NULL, 0, NULL, GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, grn_ctx_at(&ctx, GRN_DB_UINT32), NULL);
//printf("%.*s\t", search_key_size, (char *)search_key);
start = clock();
grn_pat_fuzzy_search(&ctx, (grn_pat *)table, search_key, search_key_size, min_size, max_distance, with_transposition, (grn_hash *)hash);
end = clock();
//printf("%d\t", grn_table_size(&ctx, hash));
//printf("%f\n", (double)(end-start)/CLOCKS_PER_SEC);
sum += (double)(end - start)/CLOCKS_PER_SEC;
output_with_sort_by_score(&ctx, hash, table, search_key, search_key_size, "pat_fuzzy.txt");
grn_obj_unlink(&ctx, hash);
});
printf("%s\n", argv[2]);
printf("max_distance\tavg\n");
printf("%d\t%f\n", max_distance, sum / grn_table_size(&ctx, words));
}
printf("---- calc edit distance each by record ---\n");
{
void *search_key = NULL;
int search_key_size;
double sum = 0;
GRN_TABLE_EACH(&ctx, words, GRN_ID_NIL, GRN_ID_NIL,
result_id, &search_key, &search_key_size, NULL, {
grn_obj *hash;
hash = grn_table_create(&ctx, NULL, 0, NULL, GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, grn_ctx_at(&ctx, GRN_DB_UINT32), NULL);
//printf("%.*s\t", search_key_size, (char *)search_key);
start = clock();
table_edit_distance(&ctx, table, search_key, search_key_size, max_distance, with_transposition, (grn_hash *)hash);
end = clock();
//printf("%d\t", grn_table_size(&ctx, hash));
//printf("%f\n", (double)(end-start)/CLOCKS_PER_SEC);
sum += (double)(end - start)/CLOCKS_PER_SEC;
output_with_sort_by_score(&ctx, hash, table, search_key, search_key_size, "edit.txt");
grn_obj_unlink(&ctx, hash);
});
printf("%s\n", argv[2]);
printf("max_distance\tavg\n");
printf("%d\t%f\n", max_distance, sum / grn_table_size(&ctx, words));
}
grn_obj_unlink(&ctx, words);
grn_obj_unlink(&ctx, table);
grn_obj_close(&ctx, db);
grn_ctx_fin(&ctx);
grn_fin();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment