Created
October 10, 2017 14:17
-
-
Save cipepser/44a00f5d2f3cfeaf00c39b54c41e5d80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"errors" | |
"fmt" | |
"io" | |
"math" | |
"math/rand" | |
"os" | |
"strings" | |
"gonum.org/v1/gonum/mat" | |
"./q71" | |
porterstemmer "github.com/reiver/go-porterstemmer" | |
) | |
// getIndexOfDict reports the position of the first element of dict that is
// equal to s, or -1 when s does not occur in dict.
func getIndexOfDict(s string, dict []string) int {
	pos := -1
	for i := 0; i < len(dict); i++ {
		if dict[i] == s {
			pos = i
			break
		}
	}
	return pos
}
// Sigmoid evaluates the logistic function 1 / (1 + e^(-v)).
func Sigmoid(v float64) float64 {
	denom := 1.0 + math.Exp(-v)
	return 1.0 / denom
}
// Review is one labelled sample: the tokens of a sentence and its class label.
type Review struct {
	// sentence holds the tokens of the review text.
	sentence []string
	// label is the class label exactly as passed to NewReview.
	label string
}

// NewReview builds a Review with label l whose sentence is s split into
// tokens. A space, '-', '/' or ';' ends a token; runs of these separators
// never produce empty tokens.
func NewReview(l, s string) Review {
	isSeparator := func(c rune) bool {
		switch c {
		case ' ', '-', '/', ';':
			return true
		}
		return false
	}
	return Review{
		sentence: strings.FieldsFunc(s, isSeparator),
		label:    l,
	}
}
func PreProcessing(rs []Review) []Review { | |
for i := range rs { | |
rs[i].sentence = RemoveStopWords(rs[i].sentence) | |
rs[i].sentence = RemoveMetaCharacters(rs[i].sentence) | |
rs[i].sentence = Stemming(rs[i].sentence) | |
} | |
return rs | |
} | |
// RemoveMetaCharacters deletes punctuation characters from every token of
// input and returns the tokens that are still non-empty, in their original
// order. The result is nil when no token survives.
//
// The characters removed are: ` [ ] ; : / * ( ) . " , & ? ! % '
func RemoveMetaCharacters(input []string) (output []string) {
	// Each meta character is paired with the empty string, so a single pass
	// of the replacer strips all of them from a token.
	stripper := strings.NewReplacer(
		"`", "", "[", "", "]", "", ";", "", ":", "", "/", "", "*", "",
		"(", "", ")", "", ".", "", "\"", "", ",", "", "&", "", "?", "",
		"!", "", "%", "", "'", "",
	)
	for _, token := range input {
		cleaned := stripper.Replace(token)
		if cleaned == "" {
			continue
		}
		output = append(output, cleaned)
	}
	return output
}
func RemoveStopWords(input []string) (output []string) { | |
for _, s := range input { | |
if !q71.IsStopWords(s) { | |
output = append(output, s) | |
} | |
} | |
return output | |
} | |
func Stemming(str []string) []string { | |
for i := range str { | |
str[i] = porterstemmer.StemString(str[i]) | |
} | |
return str | |
} | |
func makeDictionary(rs []Review) map[string]int { | |
dict := make(map[string]int, 0) | |
for _, r := range rs { | |
for _, w := range r.sentence { | |
dict[w]++ | |
} | |
} | |
// remove noise | |
// for w := range dict { | |
// if dict[w] < 6 { | |
// delete(dict, w) | |
// } | |
// } | |
return dict | |
} | |
func makeFeatureVectors(rs []Review, dict map[string]int) ([]*mat.VecDense, []string) { | |
feature := make([]string, len(dict)+1) | |
i := 0 | |
for w := range dict { | |
feature[i] = w | |
i++ | |
} | |
X := make([]*mat.VecDense, len(rs)) | |
for i, r := range rs { | |
x := make([]float64, len(feature)) | |
for _, w := range r.sentence { | |
idx := getIndexOfDict(w, feature) | |
if idx != -1 { | |
x[idx] = 1 | |
} | |
} | |
x[len(x)-1] = 1 | |
X[i] = mat.NewVecDense(len(x), x) | |
} | |
return X, feature | |
} | |
// LogisticRegression returns w which is the weight vector by logistic regressin. | |
func LogisticRegression(X []*mat.VecDense, labels []string, eta float64) (*mat.VecDense, error) { | |
if len(X) != len(labels) { | |
return nil, errors.New("X and label must have same length.") | |
} | |
// correct label | |
t := make([]float64, len(X)) | |
for i, l := range labels { | |
if l == "+1" { | |
t[i] = 1.0 | |
} | |
} | |
// initialize the parameter | |
ws := make([]float64, X[0].Len()) | |
for i := range ws { | |
ws[i] = rand.Float64() | |
} | |
w := mat.NewVecDense(len(ws), ws) | |
// training | |
for i := range X { | |
x := mat.NewVecDense(X[i].Len(), nil) | |
x.CopyVec(X[i]) | |
p := Sigmoid(mat.Dot(w, x)) | |
x.ScaleVec(eta*(p-t[i]), x) | |
w.SubVec(w, x) | |
eta *= 0.99999 | |
} | |
return w, nil | |
} | |
func Predict(w, x *mat.VecDense) (string, float64) { | |
p := Sigmoid(mat.Dot(w, x)) | |
if p > 0.5 { | |
return "+1", p | |
} | |
return "-1", p | |
} | |
// Feature pairs a word with a numeric weight.
type Feature struct {
	// word is the token this entry refers to.
	word string
	// weight is the value associated with word.
	weight float64
}
func Divide(X []*mat.VecDense, ls []string, n int) (Xs [][]*mat.VecDense, lss [][]string) { | |
for i := 0; i < n; i++ { | |
s := i * (len(X)/n + 1) | |
e := (i + 1) * (len(X)/n + 1) | |
if e > len(X)-1 { | |
e = len(X) | |
} | |
Xs = append(Xs, X[s:e]) | |
lss = append(lss, ls[s:e]) | |
} | |
return Xs, lss | |
} | |
func main() { | |
rs := []Review{} | |
f, err := os.Open("../data/sentiment.txt") | |
defer f.Close() | |
if err != nil { | |
panic(err) | |
} | |
r := bufio.NewReader(f) | |
for { | |
l, _, err := r.ReadLine() | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
panic(err) | |
} | |
// split to label and sentence. | |
str := strings.SplitN(string(l), " ", 2) | |
r := NewReview(str[0], str[1]) | |
rs = append(rs, r) | |
} | |
rs = PreProcessing(rs) | |
dict := makeDictionary(rs) | |
X, _ := makeFeatureVectors(rs, dict) | |
labels := make([]string, len(rs)) | |
for i, r := range rs { | |
labels[i] = r.label | |
} | |
// split the data X to training data and test data. | |
Xs, lss := Divide(X, labels, 5) | |
eta := 0.6 | |
for i := range Xs { | |
trainingData := []*mat.VecDense{} | |
for _, xs := range Xs[:i] { | |
for _, x := range xs { | |
trainingData = append(trainingData, x) | |
} | |
} | |
for _, xs := range Xs[i+1 : len(Xs)] { | |
for _, x := range xs { | |
trainingData = append(trainingData, x) | |
} | |
} | |
trainingLabel := []string{} | |
for _, ls := range lss[:i] { | |
for _, l := range ls { | |
trainingLabel = append(trainingLabel, l) | |
} | |
} | |
for _, ls := range lss[i+1 : len(lss)] { | |
for _, l := range ls { | |
trainingLabel = append(trainingLabel, l) | |
} | |
} | |
w, err := LogisticRegression(trainingData, trainingLabel, eta) | |
if err != nil { | |
panic(err) | |
} | |
correct := 0 | |
actPos := 0 | |
prePos := 0 | |
andPos := 0 | |
for j, x := range Xs[i] { | |
ls := lss[i] | |
ans, _ := Predict(w, x) | |
if ans == ls[j] { | |
correct++ | |
} | |
if ls[j] == "+1" { | |
actPos++ | |
} | |
if ans == "+1" { | |
prePos++ | |
} | |
if ls[j] == "+1" && ans == "+1" { | |
andPos++ | |
} | |
} | |
fmt.Println("**** rates (", i, "/", len(Xs), ") ****") | |
fmt.Println("accuracy rate:\t", float64(correct)/float64(len(Xs[i]))) | |
preRate := float64(andPos) / float64(prePos) | |
fmt.Println("precision rate:\t", preRate) | |
recRate := float64(andPos) / float64(actPos) | |
fmt.Println("recall rate:\t", recRate) | |
fmt.Println("F1 score:\t", 2*(preRate*recRate)/(preRate+recRate)) | |
fmt.Println("") | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment