Skip to content

Instantly share code, notes, and snippets.

@cipepser
Created October 10, 2017 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cipepser/44a00f5d2f3cfeaf00c39b54c41e5d80 to your computer and use it in GitHub Desktop.
Save cipepser/44a00f5d2f3cfeaf00c39b54c41e5d80 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"errors"
"fmt"
"io"
"math"
"math/rand"
"os"
"strings"
"gonum.org/v1/gonum/mat"
"./q71"
porterstemmer "github.com/reiver/go-porterstemmer"
)
// getIndexOfDict reports the position of the first element of dict that is
// equal to s. It returns -1 when s does not occur in dict (including when
// dict is nil or empty).
func getIndexOfDict(s string, dict []string) int {
	for pos := 0; pos < len(dict); pos++ {
		if dict[pos] == s {
			return pos
		}
	}
	return -1
}
// Sigmoid evaluates the logistic function 1 / (1 + e^(-v)).
func Sigmoid(v float64) float64 {
	denom := 1.0 + math.Exp(-v)
	return 1.0 / denom
}
// Review is a single labelled example: the tokens of one sentence together
// with the label that was supplied for it.
type Review struct {
	sentence []string // tokens of the sentence
	label    string   // label text exactly as passed to NewReview
}

// NewReview builds a Review with label l whose sentence holds the tokens of s.
// The text is split on spaces, hyphens, slashes and semicolons; runs of
// separators do not produce empty tokens.
func NewReview(l, s string) Review {
	isSeparator := func(c rune) bool {
		switch c {
		case ' ', '-', '/', ';':
			return true
		}
		return false
	}
	return Review{
		sentence: strings.FieldsFunc(s, isSeparator),
		label:    l,
	}
}
// PreProcessing normalises the tokens of every review in rs, in place, and
// returns the same slice. Each sentence is passed through RemoveStopWords,
// then RemoveMetaCharacters, then Stemming, in that order.
func PreProcessing(rs []Review) []Review {
	for idx := range rs {
		review := &rs[idx]
		tokens := RemoveStopWords(review.sentence)
		tokens = RemoveMetaCharacters(tokens)
		review.sentence = Stemming(tokens)
	}
	return rs
}
// RemoveMetaCharacters deletes punctuation characters from every token of
// input and returns the tokens that are still non-empty afterwards, in their
// original order. The characters removed are:
//
//	` [ ] ; : / * ( ) . " , & ? ! % '
//
// The result is nil when no token survives.
func RemoveMetaCharacters(input []string) (output []string) {
	// Every pair maps one meta character to the empty string.
	stripper := strings.NewReplacer(
		"`", "", "[", "", "]", "", ";", "", ":", "", "/", "", "*", "",
		"(", "", ")", "", ".", "", "\"", "", ",", "", "&", "", "?", "",
		"!", "", "%", "", "'", "",
	)
	for _, token := range input {
		cleaned := stripper.Replace(token)
		if cleaned == "" {
			continue
		}
		output = append(output, cleaned)
	}
	return output
}
// RemoveStopWords returns the elements of input for which q71.IsStopWords
// reports false, preserving their order. The result is nil when nothing is
// kept.
func RemoveStopWords(input []string) (output []string) {
	for _, word := range input {
		if q71.IsStopWords(word) {
			continue
		}
		output = append(output, word)
	}
	return output
}
// Stemming replaces every element of str with the result of
// porterstemmer.StemString on it. The slice is modified in place and the
// same slice is returned.
func Stemming(str []string) []string {
	for pos, token := range str {
		str[pos] = porterstemmer.StemString(token)
	}
	return str
}
// makeDictionary counts how many times each token occurs across the
// sentences of all reviews in rs and returns the counts keyed by token.
func makeDictionary(rs []Review) map[string]int {
	counts := map[string]int{}
	for i := range rs {
		for _, token := range rs[i].sentence {
			counts[token] = counts[token] + 1
		}
	}
	// Noise removal is currently disabled; the inactive code is kept here:
	// for w := range counts {
	// 	if counts[w] < 6 {
	// 		delete(counts, w)
	// 	}
	// }
	return counts
}
// makeFeatureVectors converts every review into a binary bag-of-words vector.
//
// The returned feature slice lists the keys of dict in map-iteration order
// (so the order is not stable between runs), followed by one empty-string
// slot that stands for the bias term. X[i] has one component per feature:
// the component is 1 when that word occurs in rs[i].sentence and 0 otherwise;
// the last (bias) component is always 1. Tokens that are not keys of dict are
// ignored.
func makeFeatureVectors(rs []Review, dict map[string]int) ([]*mat.VecDense, []string) {
	feature := make([]string, len(dict)+1)
	// index maps a word to its position in feature so that each token is
	// resolved with one map lookup instead of a scan over the whole vocabulary.
	index := make(map[string]int, len(dict))
	n := 0
	for w := range dict {
		feature[n] = w
		index[w] = n
		n++
	}
	bias := len(feature) - 1

	X := make([]*mat.VecDense, len(rs))
	for i, r := range rs {
		x := make([]float64, len(feature))
		for _, w := range r.sentence {
			if idx, ok := index[w]; ok {
				x[idx] = 1
			}
		}
		x[bias] = 1
		X[i] = mat.NewVecDense(len(x), x)
	}
	return X, feature
}
// LogisticRegression returns the weight vector w learned by logistic
// regression.
//
// X holds one feature vector per sample and labels the matching label for
// each of them; the label "+1" is treated as target 1 and every other label
// as target 0. The weights start at random values from rand.Float64 and are
// updated once per sample, in the order given, by
//
//	w -= eta * (Sigmoid(w·x) - target) * x
//
// after which eta is multiplied by 0.99999. The caller's eta is not modified.
//
// An error is returned when X and labels differ in length or when X is empty.
// All vectors are expected to have the same length as X[0]; the mat
// operations are expected to panic otherwise.
func LogisticRegression(X []*mat.VecDense, labels []string, eta float64) (*mat.VecDense, error) {
	if len(X) != len(labels) {
		return nil, errors.New("X and label must have same length.")
	}
	// Without this guard X[0] below would panic on an empty training set.
	if len(X) == 0 {
		return nil, errors.New("training data must not be empty")
	}

	// correct label
	t := make([]float64, len(X))
	for i, l := range labels {
		if l == "+1" {
			t[i] = 1.0
		}
	}

	// initialize the parameter
	dim := X[0].Len()
	ws := make([]float64, dim)
	for i := range ws {
		ws[i] = rand.Float64()
	}
	w := mat.NewVecDense(dim, ws)

	// training
	// step is a scratch vector reused for every sample so that the inputs
	// are never modified and no per-sample allocation is needed.
	step := mat.NewVecDense(dim, nil)
	for i, x := range X {
		p := Sigmoid(mat.Dot(w, x))
		step.ScaleVec(eta*(p-t[i]), x)
		w.SubVec(w, step)
		eta *= 0.99999
	}
	return w, nil
}
// Predict classifies x with the weight vector w. It returns the label "+1"
// when Sigmoid(w·x) is greater than 0.5 and "-1" otherwise, together with
// that sigmoid value.
func Predict(w, x *mat.VecDense) (string, float64) {
	p := Sigmoid(mat.Dot(w, x))
	label := "-1"
	if p > 0.5 {
		label = "+1"
	}
	return label, p
}
// Feature pairs a word with a weight.
// NOTE(review): nothing in the visible part of this file uses this type;
// presumably it is meant for reporting learned per-word weights — confirm.
type Feature struct {
// word is the feature's word.
word string
// weight is the value associated with word.
weight float64
}
// Divide splits X and ls into n consecutive folds and returns them as Xs and
// lss, where Xs[i] and lss[i] cover the same index range.
//
// Fold i spans the indices [i*len(X)/n, (i+1)*len(X)/n), so the folds are
// contiguous, cover every element exactly once and differ in size by at most
// one. When n exceeds len(X) some folds are empty. When n <= 0 both results
// are nil.
//
// The folds are sub-slices that share their backing arrays with X and ls.
// ls must contain at least len(X) elements; slicing panics otherwise.
func Divide(X []*mat.VecDense, ls []string, n int) (Xs [][]*mat.VecDense, lss [][]string) {
	total := len(X)
	for i := 0; i < n; i++ {
		// Integer boundaries computed from the running total can never
		// exceed len(X), unlike a fixed chunk size rounded up.
		s := i * total / n
		e := (i + 1) * total / n
		Xs = append(Xs, X[s:e])
		lss = append(lss, ls[s:e])
	}
	return Xs, lss
}
// main reads labelled sentences from ../data/sentiment.txt, builds
// bag-of-words feature vectors and runs 5-fold cross-validation of the
// logistic regression classifier, printing accuracy, precision, recall and
// F1 score for every fold.
//
// Each input line is expected to look like "<label> <sentence>". Lines that
// contain no space (including blank lines) are skipped.
func main() {
	f, err := os.Open("../data/sentiment.txt")
	if err != nil {
		panic(err)
	}
	// Registered only after the error check so that Close always runs on a
	// successfully opened file.
	defer f.Close()

	rs := []Review{}
	reader := bufio.NewReader(f)
	for {
		// ReadString returns the complete line regardless of its length; a
		// final line without a trailing newline arrives together with io.EOF.
		line, err := reader.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}
		line = strings.TrimRight(line, "\r\n")

		// split to label and sentence.
		if str := strings.SplitN(line, " ", 2); len(str) == 2 {
			rs = append(rs, NewReview(str[0], str[1]))
		}
		if err == io.EOF {
			break
		}
	}

	rs = PreProcessing(rs)
	dict := makeDictionary(rs)
	X, _ := makeFeatureVectors(rs, dict)
	labels := make([]string, len(rs))
	for i, r := range rs {
		labels[i] = r.label
	}

	// split the data X to training data and test data.
	Xs, lss := Divide(X, labels, 5)
	eta := 0.6
	for i := range Xs {
		// Fold i is the test set; every other fold, in order, is training data.
		var trainingData []*mat.VecDense
		var trainingLabel []string
		for j := range Xs {
			if j == i {
				continue
			}
			trainingData = append(trainingData, Xs[j]...)
			trainingLabel = append(trainingLabel, lss[j]...)
		}

		w, err := LogisticRegression(trainingData, trainingLabel, eta)
		if err != nil {
			panic(err)
		}

		correct := 0 // predictions equal to the true label
		actPos := 0  // samples whose true label is "+1"
		prePos := 0  // samples predicted as "+1"
		andPos := 0  // samples that are both truly and predicted "+1"
		ls := lss[i]
		for j, x := range Xs[i] {
			ans, _ := Predict(w, x)
			if ans == ls[j] {
				correct++
			}
			if ls[j] == "+1" {
				actPos++
			}
			if ans == "+1" {
				prePos++
			}
			if ls[j] == "+1" && ans == "+1" {
				andPos++
			}
		}

		// The ratios below are floating-point divisions, so a zero
		// denominator prints NaN rather than panicking.
		fmt.Println("**** rates (", i, "/", len(Xs), ") ****")
		fmt.Println("accuracy rate:\t", float64(correct)/float64(len(Xs[i])))
		preRate := float64(andPos) / float64(prePos)
		fmt.Println("precision rate:\t", preRate)
		recRate := float64(andPos) / float64(actPos)
		fmt.Println("recall rate:\t", recRate)
		fmt.Println("F1 score:\t", 2*(preRate*recRate)/(preRate+recRate))
		fmt.Println("")
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment