Skip to content

Instantly share code, notes, and snippets.

@cipepser
Created October 9, 2017 01:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cipepser/3159bfb9a6735815106e74f20eba411d to your computer and use it in GitHub Desktop.
Save cipepser/3159bfb9a6735815106e74f20eba411d to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"errors"
"fmt"
"io"
"math"
"math/rand"
"os"
"sort"
"strings"
"gonum.org/v1/gonum/mat"
"./q71"
porterstemmer "github.com/reiver/go-porterstemmer"
)
func getIndexOfDict(s string, dict []string) int {
for i, w := range dict {
if w == s {
return i
}
}
return -1
}
func Sigmoid(v float64) float64 { return 1.0 / (1.0 + math.Exp(-v)) }
type Review struct {
sentence []string
label string
}
func NewReview(l, s string) Review {
r := Review{
label: l,
}
r.sentence = strings.FieldsFunc(s, func(r rune) bool {
return r == ' ' || r == '-' || r == '/' || r == ';'
})
return r
}
func PreProcessing(rs []Review) []Review {
for i := range rs {
rs[i].sentence = RemoveStopWords(rs[i].sentence)
rs[i].sentence = RemoveMetaCharacters(rs[i].sentence)
rs[i].sentence = Stemming(rs[i].sentence)
}
return rs
}
func RemoveMetaCharacters(input []string) (output []string) {
metas := []string{
"`", "[", "]", ";", ":", "/", "*",
"(", ")", ".", "\"", ",", "&", "?",
"!", "%", "'",
}
// s = strings.Replace(s, "'", "", -1)
for _, s := range input {
for _, m := range metas {
s = strings.Replace(s, m, "", -1)
}
if s != "" {
output = append(output, s)
}
}
return output
}
func RemoveStopWords(input []string) (output []string) {
for _, s := range input {
if !q71.IsStopWords(s) {
output = append(output, s)
}
}
return output
}
func Stemming(str []string) []string {
for i := range str {
str[i] = porterstemmer.StemString(str[i])
}
return str
}
func makeDictionary(rs []Review) map[string]int {
dict := make(map[string]int, 0)
for _, r := range rs {
for _, w := range r.sentence {
dict[w]++
}
}
// remove noise
// for w := range dict {
// if dict[w] < 6 {
// delete(dict, w)
// }
// }
return dict
}
func makeFeatureVectors(rs []Review, dict map[string]int) ([]*mat.VecDense, []string) {
feature := make([]string, len(dict)+1)
i := 0
for w := range dict {
feature[i] = w
i++
}
X := make([]*mat.VecDense, len(rs))
for i, r := range rs {
x := make([]float64, len(feature))
for _, w := range r.sentence {
idx := getIndexOfDict(w, feature)
if idx != -1 {
x[idx] = 1
}
}
x[len(x)-1] = 1
X[i] = mat.NewVecDense(len(x), x)
}
return X, feature
}
// LogisticRegression returns w which is the weight vector by logistic regressin.
func LogisticRegression(X []*mat.VecDense, labels []string, eta float64) (*mat.VecDense, error) {
if len(X) != len(labels) {
return nil, errors.New("X and label must have same length.")
}
// correct label
t := make([]float64, len(X))
for i, l := range labels {
if l == "+1" {
t[i] = 1.0
}
}
// initialize the parameter
ws := make([]float64, X[0].Len())
for i := range ws {
ws[i] = rand.Float64()
}
w := mat.NewVecDense(len(ws), ws)
// training
for i := range X {
x := mat.NewVecDense(X[i].Len(), nil)
x.CopyVec(X[i])
p := Sigmoid(mat.Dot(w, x))
x.ScaleVec(eta*(p-t[i]), x)
w.SubVec(w, x)
eta *= 0.99999
}
return w, nil
}
func Predict(w, x *mat.VecDense) (string, float64) {
p := Sigmoid(mat.Dot(w, x))
if p > 0.5 {
return "+1", p
}
return "-1", p
}
type Feature struct {
word string
weight float64
}
func main() {
rs := []Review{}
f, err := os.Open("../data/sentiment.txt")
defer f.Close()
if err != nil {
panic(err)
}
r := bufio.NewReader(f)
for {
l, _, err := r.ReadLine()
if err == io.EOF {
break
}
if err != nil {
panic(err)
}
// split to label and sentence.
str := strings.SplitN(string(l), " ", 2)
r := NewReview(str[0], str[1])
rs = append(rs, r)
}
// q72
rs = PreProcessing(rs)
dict := makeDictionary(rs)
X, feature := makeFeatureVectors(rs, dict)
// q73
labels := make([]string, len(rs))
for i, r := range rs {
labels[i] = r.label
}
eta := 0.6
w, err := LogisticRegression(X, labels, eta)
if err != nil {
panic(err)
}
// q74, 76
fmt.Println("**** labeling and the probablity ****")
fmt.Println("No.\tcorrect\tpredicted\tprobability")
fmt.Println("--------------------------------------------")
for i, x := range X[:10] {
ans, p := Predict(w, x)
fmt.Println("[", i, "]\t", labels[i], "\t", ans, "\t", p)
}
fmt.Println("")
// q75
features := make([]Feature, w.Len())
for i := 0; i < w.Len(); i++ {
features[i].weight = w.At(i, 0)
features[i].word = feature[i]
}
sort.Slice(features, func(i, j int) bool {
return features[i].weight < features[j].weight
})
fmt.Println("**** 10 highest weighted words ****")
fmt.Println("rank\tword\tweight")
fmt.Println("--------------------------------------------")
for i := 0; i < 10; i++ {
fmt.Println("[", i, "]\t", features[len(features)-i-1].word,
"\t", features[len(features)-i-1].weight)
}
fmt.Println("")
fmt.Println("**** 10 lowest weighted words ****")
fmt.Println("rank\tword\tweight")
fmt.Println("--------------------------------------------")
for i := 0; i < 10; i++ {
fmt.Println("[", i, "]\t", features[i].word, "\t", features[i].weight)
}
fmt.Println("")
// q77
correct := 0
actPos := 0
prePos := 0
andPos := 0
for i, x := range X {
ans, _ := Predict(w, x)
if ans == labels[i] {
correct++
}
if labels[i] == "+1" {
actPos++
}
if ans == "+1" {
prePos++
}
if labels[i] == "+1" && ans == "+1" {
andPos++
}
}
fmt.Println("**** rates ****")
fmt.Println("accuracy rate:\t", float64(correct)/float64(len(X)))
preRate := float64(andPos) / float64(prePos)
fmt.Println("precision rate:\t", preRate)
recRate := float64(andPos) / float64(actPos)
fmt.Println("recall rate:\t", recRate)
fmt.Println("F1 score:\t", 2*(preRate*recRate)/(preRate+recRate))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment