Skip to content

Instantly share code, notes, and snippets.

@tma15
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tma15/094abc128ad62e16cfed to your computer and use it in GitHub Desktop.
Save tma15/094abc128ad62e16cfed to your computer and use it in GitHub Desktop.
Simple Feature Vectorizer
package main
// #cgo LDFLAGS: -L/usr/local/Cellar/mecab/0.996/lib -lmecab -lstdc++ `mecab-config --libs`で得られる結果
// #cgo CFLAGS: -I/usr/local/Cellar/mecab/0.996/include `mecab-config --cflags`で得られる結果
// #include <mecab.h>
// #include <stdlib.h>
import "C"
// import "fmt"
import "strings"
import "unsafe"
// MeCabTokenizer wraps a MeCab tagger handle obtained through cgo so that
// Go code can run morphological analysis on strings.
type MeCabTokenizer struct {
// mecab is the C-side tagger handle handed to the MeCab C API calls.
mecab *C.mecab_t
}
// NewMeCabTokenizer creates a MeCab tagger with an empty option string and
// returns it wrapped in a MeCabTokenizer.
//
// It panics if MeCab does not hand back a tagger, because every later call
// would otherwise pass a NULL handle into the C library.
func NewMeCabTokenizer() *MeCabTokenizer {
	// C.CString allocates on the C heap; Go's garbage collector never
	// reclaims it, so it has to be released explicitly.
	arg := C.CString("")
	defer C.free(unsafe.Pointer(arg))

	m := C.mecab_new2(arg)
	if m == nil {
		panic("mecab: failed to create tagger")
	}
	return &MeCabTokenizer{m}
}
// Tokenize runs MeCab on s and returns the analysis result as a Go string
// (a copy of whatever mecab_sparse_tostr produced). If MeCab returns NULL the
// result is the empty string.
func (t *MeCabTokenizer) Tokenize(s string) string {
	// The C copy of s lives on the C heap and must be freed by hand;
	// without this every call leaks len(s)+1 bytes.
	input := C.CString(s)
	defer C.free(unsafe.Pointer(input))

	// C.GoString copies the result into Go memory before the deferred
	// free runs, so the returned string does not alias any C buffer.
	return C.GoString(C.mecab_sparse_tostr(t.mecab, input))
}
// GetWords extracts surface forms from tokenizer output.
//
// s is expected to hold one token per line in the form
// "surface<TAB>pos,detail,...", optionally terminated by an "EOS" line;
// parsing stops at the first "EOS". A token is kept when the first
// comma-separated field after the tab equals any entry of targetPos.
// Each token is reported at most once, in input order. Lines without a tab
// are skipped. The result is never nil.
func GetWords(s string, targetPos []string) []string {
	ns := []string{}
	for _, line := range strings.Split(strings.Trim(s, "\n"), "\n") {
		if line == "EOS" {
			break
		}
		fields := strings.Split(line, "\t")
		if len(fields) < 2 {
			// No feature column (empty or malformed line): there is no
			// part of speech to compare, so skip it instead of panicking.
			continue
		}
		surface := fields[0]
		pos := strings.Split(fields[1], ",")[0]
		for _, p := range targetPos {
			if pos == p {
				ns = append(ns, surface)
				// One match is enough; stop so duplicate entries in
				// targetPos cannot add the same token twice.
				break
			}
		}
	}
	return ns
}
package main
import (
"bufio"
"fmt"
"io"
"os"
"strings"
)
// Text2Fv converts text into a binary bag-of-words feature vector.
//
// The text is split into sentences with SplitToSents, each non-empty
// sentence is tokenized with t, and every word that GetWords selects for the
// parts of speech in pos becomes a key whose value is 1.0 (presence, not
// frequency). The returned map is never nil.
func Text2Fv(text string, t *MeCabTokenizer, pos []string) map[string]float64 {
	fv := map[string]float64{}
	for _, sent := range SplitToSents(text) {
		// Splitting leaves an empty element after a trailing "。". Skip
		// empties rather than dropping the last element unconditionally,
		// which lost the final sentence of text not ending in "。".
		if sent == "" {
			continue
		}
		for _, w := range GetWords(t.Tokenize(sent), pos) {
			fv[w] = 1.0
		}
	}
	return fv
}
// SplitToSents cuts text at every Japanese full stop ("。"). The delimiter is
// not kept, so text that ends in "。" yields a trailing empty element.
func SplitToSents(text string) []string {
	const fullStop = "。"
	sents := strings.Split(text, fullStop)
	return sents
}
// PrintFv writes one line to standard output: the label y, a space, and then
// the features of x rendered as space-separated "name:value" pairs with the
// value in %f format. Features appear in map iteration order.
func PrintFv(x map[string]float64, y string) {
	feats := make([]string, 0, len(x))
	for name, val := range x {
		feats = append(feats, fmt.Sprintf("%s:%f", name, val))
	}
	fmt.Printf("%s %s\n", y, strings.Join(feats, " "))
}
// ReadCSV loads a "label,text" file and returns two parallel slices: the
// texts first, then the labels.
//
// Every line is split at its first comma only, so the text part may contain
// further commas. A trailing "\n" or "\r\n" is removed from each line, and a
// final line without a newline is still read. Lines may be arbitrarily long.
//
// ReadCSV panics if the file cannot be opened or read, or if a line contains
// no comma.
func ReadCSV(fname string) ([]string, []string) {
	fp, err := os.Open(fname)
	if err != nil {
		panic(err)
	}
	defer fp.Close()

	texts := []string{}
	labels := []string{}
	reader := bufio.NewReaderSize(fp, 4096*10)
	for {
		// ReadString reads up to the newline however long the line is.
		// ReadLine returns buffer-sized fragments of long lines, which
		// were previously mistaken for separate records.
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}
		atEOF := err == io.EOF
		if atEOF && line == "" {
			break
		}
		if strings.HasSuffix(line, "\n") {
			line = strings.TrimSuffix(strings.TrimSuffix(line, "\n"), "\r")
		}

		split := strings.SplitN(line, ",", 2)
		if len(split) != 2 {
			panic(fmt.Sprintf("Invalid line: %s", line))
		}
		texts = append(texts, split[1])
		labels = append(labels, split[0])

		if atEOF {
			break
		}
	}
	return texts, labels
}
// main reads the label,text CSV file named by the first command-line
// argument and prints one feature-vector line per record, using nouns
// ("名詞") and verbs ("動詞") as features.
//
// With no argument it prints a usage message to standard error and exits
// with status 2 instead of panicking on the missing os.Args entry.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: pass the path of a label,text CSV file as the first argument")
		os.Exit(2)
	}

	X, y := ReadCSV(os.Args[1])
	pos := []string{"名詞", "動詞"}
	t := NewMeCabTokenizer()
	for i := range X {
		PrintFv(Text2Fv(X[i], t, pos), y[i])
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment