Last active
August 29, 2015 14:07
-
-
Save tma15/094abc128ad62e16cfed to your computer and use it in GitHub Desktop.
Simple Feature Vectorizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// The LDFLAGS value below is the output of `mecab-config --libs` and the
// CFLAGS value is the output of `mecab-config --cflags`; adjust both for the
// local MeCab installation.

// #cgo LDFLAGS: -L/usr/local/Cellar/mecab/0.996/lib -lmecab -lstdc++
// #cgo CFLAGS: -I/usr/local/Cellar/mecab/0.996/include
// #include <stdlib.h>
// #include <mecab.h>
import "C"

// import "fmt"
import (
	"strings"
	"unsafe"
)
type MeCabTokenizer struct { | |
mecab *C.mecab_t | |
} | |
func NewMeCabTokenizer() *MeCabTokenizer { | |
t := C.mecab_new2(C.CString("")) | |
return &MeCabTokenizer{t} | |
} | |
func (t *MeCabTokenizer) Tokenize(s string) string { | |
r := C.mecab_sparse_tostr(t.mecab, C.CString(s)) | |
return C.GoString(r) | |
} | |
// GetWords extracts surface forms from tagger output s.
//
// s is read line by line; each line is expected to look like
// "surface<TAB>feature1,feature2,...". The first comma-separated feature is
// treated as the part of speech, and the surface is kept when that value
// equals any entry of targetPos. Reading stops at the first line that is
// exactly "EOS". Lines without a tab are skipped.
//
// The returned slice is never nil and contains each matching token once, in
// input order.
func GetWords(s string, targetPos []string) []string {
	ns := []string{}
	for _, line := range strings.Split(strings.Trim(s, "\n"), "\n") {
		if line == "EOS" {
			break
		}
		fields := strings.Split(line, "\t")
		if len(fields) < 2 {
			// No feature column (for example an empty line): there is no
			// part of speech to compare, so skip instead of indexing past
			// the end of fields.
			continue
		}
		surface := fields[0]
		pos := strings.Split(fields[1], ",")[0]
		for _, p := range targetPos {
			if pos == p {
				ns = append(ns, surface)
				// Stop at the first match so a duplicated entry in
				// targetPos cannot add the same token twice.
				break
			}
		}
	}
	return ns
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"io" | |
"os" | |
"strings" | |
) | |
func Text2Fv(text string, t *MeCabTokenizer, pos []string) map[string]float64 { | |
fv := map[string]float64{} | |
sents := SplitToSents(text) | |
numSent := len(sents) - 1 | |
for i := 0; i < numSent; i++ { | |
p := t.Tokenize(sents[i]) | |
words := GetWords(p, pos) | |
for _, w := range words { | |
if _, ok := fv[w]; !ok { | |
fv[w] = 1.0 | |
} | |
} | |
} | |
return fv | |
} | |
// SplitToSents splits text at every "。" and returns the pieces without the
// separator. Text that ends with "。" therefore yields an empty final element,
// and text containing no "。" is returned as a single element.
func SplitToSents(text string) []string {
	return strings.Split(text, "。")
}
// PrintFv writes one line to standard output: the label y, a space, and then
// the entries of x formatted as "key:value" (value printed with %f) joined by
// single spaces.
//
// The entries are emitted in map iteration order, which Go leaves
// unspecified, so the order of features on the line can differ between runs.
func PrintFv(x map[string]float64, y string) {
	vec := make([]string, 0, len(x))
	for k, v := range x {
		vec = append(vec, fmt.Sprintf("%s:%f", k, v))
	}
	fmt.Printf("%s %s\n", y, strings.Join(vec, " "))
}
// ReadCSV reads fname line by line, where every line has the form
// "label,text". Only the first comma separates the two fields, so the text
// may itself contain commas.
//
// It returns the texts (X) and the labels (y) as parallel slices in file
// order. It panics if the file cannot be opened or read, or if a line
// contains no comma (this includes empty lines).
func ReadCSV(fname string) ([]string, []string) {
	fp, err := os.Open(fname)
	if err != nil {
		panic(err)
	}
	defer fp.Close()

	X := []string{}
	y := []string{}
	reader := bufio.NewReaderSize(fp, 4096*10)
	for {
		// ReadString returns the complete line regardless of its length.
		// ReadLine hands back buffer-sized fragments for long lines, and
		// treating each fragment as a record corrupts the data.
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}
		if line != "" {
			// Strip the terminator: "\n", or "\r\n" as a pair.
			if strings.HasSuffix(line, "\n") {
				line = strings.TrimSuffix(line[:len(line)-1], "\r")
			}
			split := strings.SplitN(line, ",", 2)
			if len(split) != 2 {
				panic(fmt.Sprintf("Invalid line: %s", line))
			}
			X = append(X, split[1])
			y = append(y, split[0])
		}
		if err == io.EOF {
			// A final line without a trailing newline was handled above.
			break
		}
	}
	return X, y
}
func main() { | |
X, y := ReadCSV(os.Args[1]) | |
numData := len(X) | |
pos := []string{"名詞", "動詞"} | |
t := NewMeCabTokenizer() | |
for i := 0; i < numData; i++ { | |
fv := Text2Fv(X[i], t, pos) | |
PrintFv(fv, y[i]) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment