Skip to content

Instantly share code, notes, and snippets.

@sarthakpranesh
Created November 23, 2020 11:51
Show Gist options
  • Save sarthakpranesh/21482a0d5363fe9a252934de36f66396 to your computer and use it in GitHub Desktop.
Save sarthakpranesh/21482a0d5363fe9a252934de36f66396 to your computer and use it in GitHub Desktop.
Language Classification using nnGo
package main
import (
"encoding/csv"
"fmt"
"os"
"github.com/sarthakpranesh/nnGo"
)
// testIndex is the row that main prints in its raw, n-gram and encoded
// forms as a quick visual sanity check.
const testIndex = 122

// ngram is the length of every substring that NGramConvertion cuts out of a
// title.
const ngram = 5
// langs lists the language labels the classifier distinguishes. The position
// of a label in this slice is also the position of the 1 in its one-hot
// vector in langsHot.
var langs = []string{"hindi", "tamil", "telugu", "marathi", "kannada"}

// langsHot maps each label in langs to its one-hot encoding: a vector of
// len(langs) zeros with a single 1 at the label's index in langs.
var langsHot = func() map[string][]float64 {
	table := make(map[string][]float64, len(langs))
	for pos, name := range langs {
		vec := make([]float64, len(langs))
		vec[pos] = 1
		table[name] = vec
	}
	return table
}()
// RawData holds two parallel string slices: titles[i] is a title and
// langs[i] is the language label that belongs to it.
type RawData struct {
	titles, langs []string
}
// GramData is the n-gram view of a data set: titles[i] is the list of
// n-grams belonging to title i and langs[i] is that title's language label.
type GramData struct {
	titles [][]string // one n-gram list per title
	langs  []string   // language label per title, parallel to titles
}
// LoadCsv reads the CSV file at name and returns its first two columns as a
// RawData: field 0 of every record becomes a title and field 1 its language
// label. The first record is treated as a header and skipped.
//
// An error is returned when the file cannot be opened, when it is not valid
// CSV, or when a data record has fewer than two fields. A file that holds
// only a header (or nothing at all) yields an empty RawData and a nil error.
func LoadCsv(name string) (RawData, error) {
	f, err := os.Open(name)
	if err != nil {
		// *os.PathError already names the file, so no extra context is needed.
		return RawData{}, err
	}
	defer f.Close()

	records, err := csv.NewReader(f).ReadAll()
	if err != nil {
		return RawData{}, fmt.Errorf("parsing %q: %w", name, err)
	}
	if len(records) <= 1 {
		return RawData{}, nil
	}

	rows := records[1:] // drop the header record
	data := RawData{
		titles: make([]string, 0, len(rows)),
		langs:  make([]string, 0, len(rows)),
	}
	for i, rec := range rows {
		// Guard the rec[1] access below: a single-column file would
		// otherwise panic with an index-out-of-range error.
		if len(rec) < 2 {
			return RawData{}, fmt.Errorf("%s: record %d has %d field(s), want at least 2", name, i+2, len(rec))
		}
		data.titles = append(data.titles, rec[0])
		data.langs = append(data.langs, rec[1])
	}
	return data, nil
}
// NGramConvertion turns every title in rawData into the list of its
// overlapping n-grams of length ngram and returns them, together with the
// unchanged language labels, as a GramData.
//
// The window slides over the title one character at a time. Characters are
// Unicode code points rather than bytes, so multi-byte UTF-8 characters are
// never cut in half; for pure-ASCII titles the result equals a byte-wise
// window. A title shorter than ngram characters yields an empty n-gram list.
func NGramConvertion(rawData RawData) GramData {
	gramData := GramData{
		langs:  rawData.langs,
		titles: make([][]string, 0, len(rawData.titles)),
	}
	for _, title := range rawData.titles {
		runes := []rune(title)

		var grams []string
		if n := len(runes) - ngram + 1; n > 0 {
			grams = make([]string, 0, n)
		}
		// end is the exclusive upper bound of the current window.
		for end := ngram; end <= len(runes); end++ {
			grams = append(grams, string(runes[end-ngram:end]))
		}
		gramData.titles = append(gramData.titles, grams)
	}
	return gramData
}
// CreateDictionaries builds one n-gram frequency dictionary per language.
//
// The result maps a language label to a map from n-gram to the number of
// times that n-gram occurs in the titles carrying that label. Every label
// listed in langs gets a dictionary, even if no title uses it. A label that
// appears in gramData but is missing from langs gets its dictionary created
// on first use instead of causing a nil-map assignment panic.
//
// gramData.langs must have at least as many entries as gramData.titles.
func CreateDictionaries(gramData GramData) map[string]map[string]float64 {
	dict := make(map[string]map[string]float64, len(langs))
	for _, lang := range langs {
		dict[lang] = make(map[string]float64)
	}
	for i, grams := range gramData.titles {
		if len(grams) == 0 {
			continue
		}
		label := gramData.langs[i]
		counts, ok := dict[label]
		if !ok {
			// Label not listed in langs: start a dictionary for it rather
			// than writing into a nil map.
			counts = make(map[string]float64)
			dict[label] = counts
		}
		for _, gram := range grams {
			counts[gram]++
		}
	}
	return dict
}
// NormalizeDictionary rescales the values of dict by dividing each one by
// the largest value in the map, so the largest entry becomes 1. The values
// are relative weights and do not sum to 1.
//
// The map is modified in place and also returned for convenience. If the map
// is empty or has no value greater than zero it is returned unchanged, which
// avoids a division by zero that would fill it with NaN or ±Inf.
func NormalizeDictionary(dict map[string]float64) map[string]float64 {
	var peak float64
	for _, v := range dict {
		if v > peak {
			peak = v
		}
	}
	if peak == 0 {
		return dict
	}
	// Overwriting existing keys while ranging over a map is safe in Go.
	for k, v := range dict {
		dict[k] = v / peak
	}
	return dict
}
// GenerateNNData converts n-gram data into numeric feature vectors and
// labels.
//
// For title i the returned feature vector has len(langs)+1 entries:
//
//   - entry 0 is the constant 1 (presumably a bias input for the network —
//     TODO confirm against the model);
//   - entry j+1 is the sum, over the title's n-grams, of dicts[langs[j]][gram].
//     N-grams missing from a dictionary contribute 0.
//
// The label for title i is langsHot[gramData.langs[i]]. Label slices are
// shared with langsHot and not copied, so callers must not modify them. A
// label that is not a key of langsHot produces a nil label slice.
//
// The first return value holds the feature vectors, the second the labels;
// both are parallel to gramData.titles.
func GenerateNNData(gramData GramData, dicts map[string]map[string]float64) ([][]float64, [][]float64) {
	tData := make([][]float64, 0, len(gramData.titles))
	tLabel := make([][]float64, 0, len(gramData.titles))
	for i, grams := range gramData.titles {
		tLabel = append(tLabel, langsHot[gramData.langs[i]])

		// Size the vector from langs so that adding a language cannot
		// overrun a fixed-length literal.
		features := make([]float64, len(langs)+1)
		features[0] = 1
		for _, gram := range grams {
			for j, lang := range langs {
				features[j+1] += dicts[lang][gram]
			}
		}
		tData = append(tData, features)
	}
	return tData, tLabel
}
// main runs the whole language-classification experiment:
//
//  1. load the titles and labels from combined.csv,
//  2. split every title into n-grams,
//  3. build and normalize one n-gram dictionary per language,
//  4. encode every title as a feature vector,
//  5. train an nnGo network on the first trainSize rows, and
//  6. report how many of the remaining rows are classified correctly.
//
// Row testIndex is printed after several stages as a visual check. The
// program exits with status 1 if the file cannot be loaded or has too few
// rows for the fixed indices used below.
func main() {
	const (
		dataFile  = "combined.csv"
		trainSize = 5000 // rows used for training; the rest are test rows
	)
	// Test-set rows that are printed and predicted individually.
	sampleRows := []int{21, 22}

	// loading csv data
	rawCsvData, err := LoadCsv(dataFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Every fixed index used below must exist, otherwise the program would
	// panic with an index-out-of-range error on a small data set.
	need := testIndex + 1
	for _, s := range sampleRows {
		if n := trainSize + s + 1; n > need {
			need = n
		}
	}
	if rows := len(rawCsvData.titles); rows < need {
		fmt.Fprintf(os.Stderr, "%s: need at least %d data rows, got %d\n", dataFile, need, rows)
		os.Exit(1)
	}
	fmt.Println("Title:", rawCsvData.titles[testIndex], "\tLanguage:", rawCsvData.langs[testIndex])

	// Split the raw titles into n-grams
	gramData := NGramConvertion(rawCsvData)
	fmt.Println("Title:", gramData.titles[testIndex], "\tLanguage:", gramData.langs[testIndex])

	// Create Dictionaries
	dicts := CreateDictionaries(gramData)

	// Normalizing Dictionaries - scale each language's counts by its maximum
	for _, lang := range langs {
		dicts[lang] = NormalizeDictionary(dicts[lang])
	}

	movieTitles, movieLangs := GenerateNNData(gramData, dicts)
	fmt.Println("Title:", movieTitles[testIndex], "\tLanguage:", movieLangs[testIndex])

	trainData := movieTitles[:trainSize]
	trainLabel := movieLangs[:trainSize]
	testData := movieTitles[trainSize:]
	testLabel := movieLangs[trainSize:]

	// Neural Network Model
	// NOTE(review): the arguments are presumably input size, hidden size,
	// output size, learning rate, optimizer and epochs — confirm against the
	// nnGo documentation. The input size is taken from the data itself so it
	// always matches the width of the feature vectors.
	nn := nnGo.NewNN(len(trainData[0]), 100, len(langs), 0.00000006, "sgd", 2000)
	nn.Train(trainData, trainLabel)

	for _, idx := range sampleRows {
		fmt.Println("Actual Encoded Movie Title:", testData[idx])
		fmt.Println("Actual Language:", testLabel[idx])
		// The result is deliberately not used here.
		// NOTE(review): presumably Predict prints its output — confirm.
		nn.Predict(testData[idx])
	}

	var correct, wrong float64
	for i, sample := range testData {
		pred := nn.Predict(sample)

		// Arg-max over the outputs. Seeding from the first element (rather
		// than from 0) keeps the result correct when every output is <= 0.
		best, bestIdx := 0.0, 0
		for j, out := range pred {
			if j == 0 || out[0] > best {
				best, bestIdx = out[0], j
			}
		}

		// The prediction is right when the one-hot label has its 1 at the
		// arg-max position.
		if testLabel[i][bestIdx] == 1 {
			correct++
		} else {
			wrong++
		}
	}
	fmt.Println("Number of Correct results:", correct)
	fmt.Println("Number of Wrong results:", wrong)
	fmt.Println("Accuracy:", correct*100/(correct+wrong))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment