-
-
Save takkkun/5422519 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'mkmf'

# Build configuration for the Hijiki C extension.
#
# The extension's C source exports Init_Hijiki and the benchmark loads it with
# require './Hijiki', so the Makefile target must be spelled 'Hijiki' as well:
# Ruby derives the Init_<name> symbol from the file name it loads, and the
# lowercase target only works on case-insensitive filesystems.
unless have_header('mecab.h') && have_library('mecab')
  # Fail loudly instead of silently producing no Makefile.
  abort 'mecab.h / libmecab not found: install MeCab and its development files first'
end

create_makefile('Hijiki')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "ruby.h" | |
#include "ruby/encoding.h" | |
#include "mecab.h" | |
typedef struct { | |
mecab_t *mecab; | |
rb_encoding *encoding; | |
} hijiki_t; | |
/*
 * Free callback handed to Data_Wrap_Struct.
 *
 * Destroys the MeCab tagger (when one is present) and then releases the
 * wrapper struct itself, which was obtained with ALLOC in hijiki_alloc and
 * would otherwise be leaked for every collected object.
 */
void hijiki_free(hijiki_t *hijiki) {
  if (hijiki == NULL) {
    return;
  }
  if (hijiki->mecab != NULL) {
    mecab_destroy(hijiki->mecab);
    hijiki->mecab = NULL;
  }
  xfree(hijiki);
}
/*
 * Allocator for Hijiki instances.
 *
 * ALLOC does not zero the memory it returns, so both fields are cleared
 * explicitly. hijiki_free tests hijiki->mecab against NULL, and it can run
 * for objects whose initialize never completed (for example when initialize
 * raised); an uninitialized pointer there would be passed to mecab_destroy.
 */
static VALUE hijiki_alloc(VALUE klass) {
  hijiki_t *hijiki = ALLOC(hijiki_t);
  hijiki->mecab = NULL;
  hijiki->encoding = NULL;
  return Data_Wrap_Struct(klass, 0, hijiki_free, hijiki);
}
/*
 * Hijiki#initialize(*options)
 *
 * Creates the MeCab tagger from the given option strings, e.g.
 * Hijiki.new('-O', 'wakati', '-d', dicdir).
 *
 * Each argument must be a String (or respond to to_str) without embedded NUL
 * bytes; StringValueCStr raises TypeError / ArgumentError otherwise.
 * Raises RuntimeError when MeCab cannot create a tagger from the options.
 * Returns self.
 */
VALUE hijiki_initialize(int argc, VALUE *argv, VALUE self) {
  /* mecab_new follows the main() convention: argv[0] is taken as the program
   * name and option parsing starts at argv[1]. Passing the caller's options
   * directly would silently drop the first one, so a name is prepended. */
  static char program_name[] = "hijiki";
  hijiki_t *hijiki;
  char **options;
  int i;

  Data_Get_Struct(self, hijiki_t, hijiki);

  /* Stack allocation: nothing to free, even if a conversion below raises. */
  options = ALLOCA_N(char *, argc + 1);
  options[0] = program_name;
  for (i = 0; i < argc; ++i) {
    options[i + 1] = StringValueCStr(argv[i]);
  }

  hijiki->mecab = mecab_new(argc + 1, options);
  if (hijiki->mecab == NULL) {
    /* mecab_strerror(NULL) reports the error of the failed creation. */
    rb_raise(rb_eRuntimeError, "failed to create MeCab tagger: %s",
             mecab_strerror(NULL));
  }
  hijiki->encoding = rb_enc_find("utf-8");

  /* The original fell off the end of a VALUE-returning function. */
  return self;
}
/*
 * Hijiki#parse(text) -> String
 *
 * Runs MeCab over text and returns its output as a new String tagged with
 * the encoding stored on the instance.
 *
 * text must be a String (or respond to to_str) without embedded NUL bytes;
 * StringValueCStr raises TypeError / ArgumentError otherwise.
 * Raises RuntimeError when the instance holds no tagger or when MeCab
 * reports a failure by returning NULL.
 */
VALUE hijiki_parse(VALUE self, VALUE text) {
  hijiki_t *hijiki;
  const char *input;
  const char *output;

  Data_Get_Struct(self, hijiki_t, hijiki);
  if (hijiki->mecab == NULL) {
    rb_raise(rb_eRuntimeError, "MeCab tagger is not initialized");
  }

  input = StringValueCStr(text);
  output = mecab_sparse_tostr(hijiki->mecab, input);
  if (output == NULL) {
    /* Calling strlen on a NULL result would crash the interpreter. */
    rb_raise(rb_eRuntimeError, "MeCab failed to parse: %s",
             mecab_strerror(hijiki->mecab));
  }
  return rb_enc_str_new(output, (long)strlen(output), hijiki->encoding);
}
void Init_Hijiki() { | |
VALUE cHijiki = rb_define_class("Hijiki", rb_cObject); | |
rb_define_alloc_func(cHijiki, hijiki_alloc); | |
rb_define_method(cHijiki, "initialize", hijiki_initialize, -1); | |
rb_define_method(cHijiki, "parse", hijiki_parse, 1); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
require 'benchmark' | |
require 'MeCab' | |
require 'natto' | |
require 'ffi' | |
require './Hijiki' | |
# Variant of Natto with the bottleneck in its parse removed: parse hands the
# string straight to mecab_sparse_tostr and only re-tags the result's encoding.
class DaizuNatto < Natto::MeCab
  # Parses +str+ and returns the output tagged with Encoding.default_external.
  # Raises ArgumentError when +str+ is nil.
  def parse(str)
    if str.nil?
      raise ArgumentError, 'String to parse cannot be nil'
    end

    raw_output = mecab_sparse_tostr(@tagger, str)
    raw_output.force_encoding(Encoding.default_external)
  end
end
# Hand-rolled version: a minimal FFI binding that calls libmecab directly.
class Negitoro
  extend FFI::Library
  ffi_lib 'mecab'

  attach_function :mecab_new2, [:string], :pointer
  attach_function :mecab_sparse_tostr, [:pointer, :string], :string
  attach_function :mecab_destroy, [:pointer], :void

  # Builds the finalizer for a tagger pointer. It is created in a class method
  # so the proc does not capture the instance it is attached to.
  #
  # Finalizers are invoked with the object id as their single argument; a
  # zero-arity lambda raises ArgumentError on that call, so the tagger would
  # never be destroyed. The argument is accepted and ignored.
  def self.clean_proc(tagger)
    ->(_object_id) { mecab_destroy tagger }
  end

  # Creates a tagger from a MeCab option string (e.g. "-O wakati").
  # Raises RuntimeError when libmecab returns a null tagger.
  def initialize(option = "")
    @tagger = mecab_new2 option
    raise "mecab_new2 failed for options: #{option.inspect}" if @tagger.null?

    ObjectSpace.define_finalizer self, self.class.clean_proc(@tagger)
  end

  # Parses +str+ and returns MeCab's output tagged with
  # Encoding.default_external.
  # Raises ArgumentError when +str+ is nil and RuntimeError when MeCab returns
  # no result.
  def parse(str)
    raise ArgumentError, 'String to parse cannot be nil' if str.nil?

    result = mecab_sparse_tostr(@tagger, str)
    raise 'mecab_sparse_tostr returned no result' if result.nil?

    result.force_encoding(Encoding.default_external)
  end
end
# Benchmark workload: calls tagger.parse on the same fixed sentence 10,000
# times. +tagger+ only needs to respond to parse.
def do_parse(tagger)
  10_000.times do
    tagger.parse("太郎はこの本を二郎を見た女性に渡した。")
  end
end
# Take the path on the first `filename:` line printed by `mecab -D` and use
# its directory as the dictionary directory.
dictionary_dump = `mecab -D`
dictionary_files = dictionary_dump.each_line.grep(/^filename:/) { |line| line.sub(/^filename:\t/, '').chomp }
dicdir = File.dirname(dictionary_files.first)

# Run the same workload against every binding; bmbm performs a rehearsal pass
# before the measured pass. Each label is paired with the tagger built on the
# same line.
Benchmark.bmbm(10) do |bench|
  bench.report("和布蕪") { do_parse(MeCab::Tagger.new) }
  bench.report("納豆") { do_parse(Natto::MeCab.new) }
  bench.report("大豆納豆") { do_parse(DaizuNatto.new) }
  bench.report("ネギトロ") { do_parse(Negitoro.new) }
  bench.report("鹿尾菜") { do_parse(Hijiki.new('-O', 'wakati', '-d', dicdir)) }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment