@lestrrat
Created March 10, 2014 05:00
Homebrew golang mecab binding (a minimal thing for tokenizing Japanese with mecab)
package tokenizer

/*
#cgo CFLAGS: XXX CHANGE ME XXX
#cgo LDFLAGS: XXX CHANGE ME XXX
// the flags above typically come from `mecab-config --cflags` and `mecab-config --libs`
#include <mecab.h>
*/
import "C"

import "errors"
var ErrNoMoreTokens = errors.New("No more tokens")

type Iterator interface {
	Next() (string, error)
}

type Tokenizer interface {
	Tokenize(string) Iterator
}
type TokenizeMecab struct {
	mecab *C.mecab_t
}

type TokenizeMecabIter struct {
	root    *C.mecab_node_t
	current *C.mecab_node_t
}
func NewMecab(s string) *TokenizeMecab {
	// s is passed straight to mecab_new2 as its argument string;
	// note that the C string allocated here is never freed
	return &TokenizeMecab{C.mecab_new2(C.CString(s))}
}

func (t *TokenizeMecab) Tokenize(input string) *TokenizeMecabIter {
	p := C.CString(input)
	node := C.mecab_sparse_tonode(t.mecab, p)
	// the first node is the BOS node, so iteration starts at node.next
	return &TokenizeMecabIter{node, node.next}
}
func (iter *TokenizeMecabIter) Next() (string, error) {
	if iter.current == nil {
		return "", ErrNoMoreTokens
	}
	node := iter.current
	iter.current = iter.current.next
	// node.surface is not NUL-terminated per token, so copy exactly node.length bytes
	return C.GoStringN(node.surface, C.int(node.length)), nil
}
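A minimal usage sketch, assuming mecab and a dictionary are installed and the #cgo flags above have been filled in. The import path "tokenizer" is hypothetical; the argument to NewMecab goes straight to mecab_new2, so an empty string falls back to mecab's default dictionary.

package main

import (
	"fmt"

	"tokenizer" // hypothetical import path for the package above
)

func main() {
	t := tokenizer.NewMecab("") // "" lets mecab pick its default dictionary
	iter := t.Tokenize("すもももももももものうち")
	for {
		s, err := iter.Next()
		if err != nil {
			break // ErrNoMoreTokens: the node list is exhausted
		}
		if s != "" { // the trailing EOS node has an empty surface
			fmt.Println(s)
		}
	}
}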