Skip to content

Instantly share code, notes, and snippets.

@caongocthai
Last active April 23, 2022 02:43
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save caongocthai/27c62464c4aaa83dba5becbcfa78f134 to your computer and use it in GitHub Desktop.
Sentiment Analysis: Naive Bayes Classifier from scratch in Golang
package main
import (
"bufio"
"fmt"
"log"
"os"
"strings"
)
// dataset reads a tab-separated training file and returns a map of
// sentences to their class labels.
//
// Each line is expected to be "<sentence>\t<label>" where label "0"
// maps to negative and "1" maps to positive. Lines without exactly two
// tab-separated fields, or with any other label, are skipped.
//
// The process exits via log.Fatal on any file or scanner error
// (the original mixed panic and log.Fatal; error handling is now
// consistent).
func dataset(file string) map[string]string {
	f, err := os.Open(file)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	dataset := make(map[string]string)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		l := scanner.Text()
		data := strings.Split(l, "\t")
		if len(data) != 2 {
			continue
		}
		sentence := data[0]
		if data[1] == "0" {
			dataset[sentence] = negative
		} else if data[1] == "1" {
			dataset[sentence] = positive
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
	return dataset
}
// main trains the classifier on the Yelp labelled-sentences dataset and
// then classifies reviews typed on stdin, one per line, forever.
func main() {
	// Initialize a new classifier.
	nb := newClassifier()

	// Get dataset from a text file.
	// Dataset can be downloaded from
	// https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
	dataset := dataset("./sentiment labelled sentences/yelp_labelled.txt")

	// Train the classifier with the dataset.
	nb.train(dataset)

	// Prompt for inputs from the console.
	reader := bufio.NewReader(os.Stdin)
	for {
		fmt.Print("Enter your review: ")
		sentence, err := reader.ReadString('\n')
		if err != nil {
			// EOF (Ctrl-D) or a read error. The original ignored this
			// error and therefore spun forever re-classifying an empty
			// string once stdin closed; stop cleanly instead.
			return
		}

		// Classify the input sentence.
		result := nb.classify(sentence)
		class := negative
		if result[positive] > result[negative] {
			class = positive
		}
		fmt.Printf("> Your review is %s\n\n", class)
	}
}
package main
// The string values of the 2 classes.
// They can be "positive" >< "negative" as in this example.
// They could equally be any other label pair, e.g. "ham" >< "spam".
const (
	positive = "positive"
	negative = "negative"
)
/*
* Classifier
*/
// wordFrequency stores how often a single word was seen in each class.
// For example:
//
//	wordFrequency{
//		word: "excellent",
//		counter: map[string]int{
//			"positive": 15,
//			"negative": 0,
//		},
//	}
type wordFrequency struct {
	word    string
	counter map[string]int
}
// classifier can be trained and then used to categorize sentences.
//
// Attributes:
//
//	dataset: maps each class to the list of training sentences labelled
//	with that class, e.g.
//		map[string][]string{
//			"positive": []string{
//				"The restaurant is excellent",
//				"I really love this restaurant",
//			},
//			"negative": []string{
//				"Their food is awful",
//			},
//		}
//
//	words: maps each word to its per-class frequency, e.g.
//		map[string]wordFrequency{
//			"restaurant": wordFrequency{
//				word: "restaurant",
//				counter: map[string]int{
//					"positive": 2,
//					"negative": 0,
//				},
//			},
//		}
type classifier struct {
	dataset map[string][]string
	words   map[string]wordFrequency
}
// newClassifier returns a classifier whose dataset holds an empty
// sentence list for each of the two classes and whose word-frequency
// map is empty.
func newClassifier() *classifier {
	return &classifier{
		dataset: map[string][]string{
			positive: {},
			negative: {},
		},
		words: make(map[string]wordFrequency),
	}
}
// train feeds every (sentence, class) pair of the labelled input into
// the classifier, recording both the raw sentence and the per-class
// frequency of each of its tokens.
//
// Sample dataset:
//
//	map[string]string{
//		"The restaurant is excellent": "positive",
//		"Their food is awful":         "negative",
//	}
func (c *classifier) train(dataset map[string]string) {
	for sentence, class := range dataset {
		c.addSentence(sentence, class)
		for _, word := range tokenize(sentence) {
			c.addWord(word, class)
		}
	}
}
// classify returns the probability of the sentence belonging to each
// class, keyed by class name. Sample return value:
//
//	map[string]float64{"positive": 0.7, "negative": 0.1}
//
// meaning a 70% chance the input sentence is positive and 10% that it
// is negative.
func (c classifier) classify(sentence string) map[string]float64 {
	tokens := tokenize(sentence)
	return map[string]float64{
		positive: c.probability(tokens, positive),
		negative: c.probability(tokens, negative),
	}
}
// addSentence appends a training sentence to the list kept for its class.
func (c *classifier) addSentence(sentence, class string) {
	sentences := c.dataset[class]
	c.dataset[class] = append(sentences, sentence)
}
// addWord records one occurrence of word under the given class,
// creating the word's frequency entry (with both counters at zero) the
// first time the word is seen. (The original doc comment misnamed this
// method "addSentence".)
func (c *classifier) addWord(word, class string) {
	wf, seen := c.words[word]
	if !seen {
		wf = wordFrequency{
			word: word,
			counter: map[string]int{
				positive: 0,
				negative: 0,
			},
		}
	}
	wf.counter[class]++
	c.words[word] = wf
}
// priorProb returns the prior probability of a class: the fraction of
// training sentences labelled with that class. This probability is
// determined purely by the training dataset.
//
// Returns 0 for an untrained classifier instead of dividing by zero
// (the original produced NaN when both sentence lists were empty).
func (c classifier) priorProb(class string) float64 {
	total := len(c.dataset[positive]) + len(c.dataset[negative])
	if total == 0 {
		return 0
	}
	return float64(len(c.dataset[class])) / float64(total)
}
// totalWordCount returns the number of word occurrences recorded for a
// class (duplicates included). Any class other than positive or
// negative yields the total word count across the whole dataset.
func (c classifier) totalWordCount(class string) int {
	var pos, neg int
	for _, wf := range c.words {
		pos += wf.counter[positive]
		neg += wf.counter[negative]
	}
	switch class {
	case positive:
		return pos
	case negative:
		return neg
	default:
		return pos + neg
	}
}
// totalDistinctWordCount returns the vocabulary size: the number of
// distinct words seen in the training data.
//
// The original summed zeroOneTransform separately per class, which
// counted a word twice when it appeared in both classes and therefore
// overstated the vocabulary size used for Laplace smoothing in
// probability. Each word now counts once as long as it occurred at all.
func (c classifier) totalDistinctWordCount() int {
	count := 0
	for _, wf := range c.words {
		count += zeroOneTransform(wf.counter[positive] + wf.counter[negative])
	}
	return count
}
// probability returns the probability of a list of words being in a
// class. Word counts use Laplace (add-one) smoothing, and the running
// product is then divided by the evidence term for each word.
//
// The totals in the denominators are loop invariants; they are computed
// once up front instead of once per word (the original re-scanned the
// whole word map for every token, O(words x vocabulary)). The per-word
// arithmetic and its order are unchanged, so results are identical.
//
// NOTE(review): probabilities are multiplied directly rather than summed
// in log space, so very long inputs can underflow toward 0 — confirm
// inputs stay short (single review sentences).
func (c classifier) probability(words []string, class string) float64 {
	distinct := c.totalDistinctWordCount()
	classDenom := float64(c.totalWordCount(class) + distinct)
	totalDenom := float64(c.totalWordCount("") + distinct)

	prob := c.priorProb(class)

	// Likelihood: P(word|class) with add-one smoothing.
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[class]
		}
		prob *= float64(count+1) / classDenom
	}

	// Evidence: divide by the smoothed overall frequency of each word.
	for _, w := range words {
		count := 0
		if wf, ok := c.words[w]; ok {
			count = wf.counter[positive] + wf.counter[negative]
		}
		prob /= float64(count+1) / totalDenom
	}
	return prob
}
package main
import (
"math"
"regexp"
"strings"
)
/*
* Utilities
*/
// stopwords is the set of words that carry too little meaning to help
// classification; tokenize drops them before counting.
var stopwords = map[string]struct{}{
	"i": {}, "me": {}, "my": {}, "myself": {}, "we": {}, "our": {}, "ours": {},
	"ourselves": {}, "you": {}, "your": {}, "yours": {}, "yourself": {}, "yourselves": {},
	"he": {}, "him": {}, "his": {}, "himself": {}, "she": {}, "her": {}, "hers": {},
	"herself": {}, "it": {}, "its": {}, "itself": {}, "they": {}, "them": {}, "their": {},
	"theirs": {}, "themselves": {}, "what": {}, "which": {}, "who": {}, "whom": {}, "this": {},
	"that": {}, "these": {}, "those": {}, "am": {}, "is": {}, "are": {}, "was": {},
	"were": {}, "be": {}, "been": {}, "being": {}, "have": {}, "has": {}, "had": {},
	"having": {}, "do": {}, "does": {}, "did": {}, "doing": {}, "a": {}, "an": {},
	"the": {}, "and": {}, "but": {}, "if": {}, "or": {}, "because": {}, "as": {},
	"until": {}, "while": {}, "of": {}, "at": {}, "by": {}, "for": {}, "with": {},
	"about": {}, "against": {}, "between": {}, "into": {}, "through": {}, "during": {},
	"before": {}, "after": {}, "above": {}, "below": {}, "to": {}, "from": {}, "up": {},
	"down": {}, "in": {}, "out": {}, "on": {}, "off": {}, "over": {}, "under": {},
	"again": {}, "further": {}, "then": {}, "once": {}, "here": {}, "there": {}, "when": {},
	"where": {}, "why": {}, "how": {}, "all": {}, "any": {}, "both": {}, "each": {},
	"few": {}, "more": {}, "most": {}, "other": {}, "some": {}, "such": {}, "no": {},
	"nor": {}, "not": {}, "only": {}, "same": {}, "so": {}, "than": {}, "too": {},
	"very": {}, "can": {}, "will": {}, "just": {}, "don't": {}, "should": {}, "should've": {},
	"now": {}, "aren't": {}, "couldn't": {}, "didn't": {}, "doesn't": {}, "hasn't": {}, "haven't": {},
	"isn't": {}, "shouldn't": {}, "wasn't": {}, "weren't": {}, "won't": {}, "wouldn't": {},
}
// isStopword reports whether w is in the stopword set.
func isStopword(w string) bool {
	_, found := stopwords[w]
	return found
}
// nonAlnumRE matches every character that is not an ASCII letter, digit
// or space. Compiled once at package scope instead of on every call
// (the original recompiled the pattern per sentence, which is wasteful
// on the hot tokenize path).
var nonAlnumRE = regexp.MustCompile("[^a-zA-Z 0-9]+")

// cleanup lowercases a sentence and removes all characters other than
// ASCII letters, digits and spaces.
//
// NOTE(review): apostrophes are stripped too, so contractions like
// "don't" become "dont" and can never match the apostrophe-containing
// entries in stopwords — confirm whether that is intended.
func cleanup(sentence string) string {
	return nonAlnumRE.ReplaceAllString(strings.ToLower(sentence), "")
}
// tokenize splits a sentence into lowercase, cleaned-up words and drops
// any word that is a stopword.
func tokenize(sentence string) []string {
	var tokens []string
	for _, word := range strings.Fields(cleanup(sentence)) {
		if isStopword(word) {
			continue
		}
		tokens = append(tokens, word)
	}
	return tokens
}
// zeroOneTransform maps a non-negative count x to 0 when x is zero and
// to 1 otherwise, using the identity ceil(x / (x + 1)).
func zeroOneTransform(x int) int {
	ratio := float64(x) / (float64(x) + 1.0)
	return int(math.Ceil(ratio))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment