Skip to content

Instantly share code, notes, and snippets.

@naoa
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naoa/8d862028e23e45e23304 to your computer and use it in GitHub Desktop.
Save naoa/8d862028e23e45e23304 to your computer and use it in GitHub Desktop.
/* Build: gcc src/index_sample.c -o index_sample -Wall -O2 -lgroonga -I/usr/include/groonga */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <groonga.h>
#include <groonga/nfkc.h>
/*
Benchmark: Japanese Wikipedia, 300,000 records, 3.8G
real 1m12.745s
user 0m58.432s
sys 0m1.263s
*/
int
main(int argc, char **argv)
{
grn_ctx ctx;
const char *database_path = argv[1];
const char *table_name = argv[2];
const char *index_column_name = argv[3];
grn_obj *db;
grn_obj *table;
grn_obj *index_column;
grn_obj *hash;
grn_obj *sorted;
grn_init();
grn_ctx_init(&ctx, 0);
db = grn_db_open(&ctx, database_path);
table = grn_ctx_get(&ctx,
table_name,
strlen(table_name));
index_column = grn_obj_column(&ctx,
table,
index_column_name,
strlen(index_column_name));
hash = grn_table_create(&ctx, NULL, 0,
NULL,
GRN_OBJ_TABLE_HASH_KEY,
grn_ctx_at(&ctx, GRN_DB_SHORT_TEXT),
grn_ctx_at(&ctx, GRN_DB_UINT32));
{
grn_table_cursor *cur;
if ((cur = grn_table_cursor_open(&ctx, table, NULL, 0, NULL, 0, 0, -1,
GRN_CURSOR_BY_ID))) {
grn_id id;
while ((id = grn_table_cursor_next(&ctx, cur)) != GRN_ID_NIL) {
grn_obj *index_cursor;
if ((index_cursor = grn_index_cursor_open(&ctx, cur, index_column,
0, -1, GRN_CURSOR_BY_ID))) {
grn_posting *posting;
grn_id term_id = GRN_ID_NIL;
char term[GRN_TABLE_MAX_KEY_SIZE];
int term_length = 0;
grn_obj value;
GRN_UINT32_INIT(&value, 0);
while ((posting = grn_index_cursor_next(&ctx, index_cursor, &term_id))) {
term_length = grn_table_get_key(&ctx,
table,
term_id,
term,
GRN_TABLE_MAX_KEY_SIZE);
if (term_length >= 6) {
grn_char_type char_type;
char_type = grn_nfkc_char_type((unsigned char *)term);
if (char_type == GRN_CHAR_HIRAGANA || char_type == GRN_CHAR_KATAKANA ||
char_type == GRN_CHAR_KANJI) {
grn_id hash_id;
hash_id = grn_table_add(&ctx, hash, term, term_length, NULL);
if (hash_id) {
GRN_BULK_REWIND(&value);
grn_obj_get_value(&ctx, hash, hash_id, &value);
GRN_UINT32_SET(&ctx, &value, GRN_UINT32_VALUE(&value) + posting->tf);
grn_obj_set_value(&ctx, hash, hash_id, &value, GRN_OBJ_SET);
}
}
}
}
grn_obj_unlink(&ctx, &value);
}
grn_obj_unlink(&ctx, index_cursor);
}
}
grn_table_cursor_close(&ctx, cur);
}
{
unsigned int nkeys;
grn_table_sort_key *keys;
const char *sortby_val = "-_value";
unsigned int sortby_len = strlen("-_value");
int offset = 0;
int limit = -1;
sorted = grn_table_create(&ctx, NULL, 0, NULL,
GRN_OBJ_TABLE_NO_KEY, NULL, hash);
keys = grn_table_sort_key_from_str(&ctx, sortby_val, sortby_len, hash, &nkeys);
if (keys) {
grn_table_sort(&ctx, hash, offset, limit, sorted, keys, nkeys);
grn_table_sort_key_close(&ctx, keys, nkeys);
}
}
{
grn_table_cursor *cur;
if ((cur = grn_table_cursor_open(&ctx, sorted, NULL, 0, NULL,
0, 0, -1, GRN_CURSOR_BY_ID))) {
grn_id hash_id;
grn_obj value;
GRN_UINT32_INIT(&value, 0);
while ((hash_id = grn_table_cursor_next(&ctx, cur)) != GRN_ID_NIL) {
unsigned int sorted_key;
grn_table_get_key(&ctx, sorted, hash_id, &sorted_key, sizeof(unsigned int));
{
char key[GRN_TABLE_MAX_KEY_SIZE];
int key_size;
key_size = grn_table_get_key(&ctx, hash, sorted_key, &key, GRN_TABLE_MAX_KEY_SIZE);
GRN_BULK_REWIND(&value);
grn_obj_get_value(&ctx, hash, sorted_key, &value);
printf("%.*s,%d\n", key_size, key, GRN_UINT32_VALUE(&value));
}
}
grn_obj_unlink(&ctx, &value);
}
grn_table_cursor_close(&ctx, cur);
}
grn_obj_unlink(&ctx, hash);
grn_obj_unlink(&ctx, index_column);
grn_obj_unlink(&ctx, table);
grn_obj_unlink(&ctx, sorted);
grn_ctx_fin(&ctx);
grn_fin();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment