amitkgupta/goodlearn-knn-perf.svg

## goodlearn-knn-perf.svg

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              goodlearn-knn-perf.svg
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## goodlearn-knn.go
package main

import (
	"fmt"
	"github.com/amitkgupta/goodlearn/classifier/knn"
	"github.com/amitkgupta/goodlearn/csvparse"
	"github.com/amitkgupta/goodlearn/data/row"
	"runtime"

	"flag"
	"log"
	"os"
	"runtime/pprof"
)

// cpuprofile holds the value of the -cpuprofile flag: the path of the file a
// CPU profile is written to. It defaults to the empty string, in which case
// main does not start profiling.
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

// main benchmarks goodlearn's nearest-neighbour classifier (k = 1).
//
// It optionally starts a CPU profile (-cpuprofile), loads the validation and
// training datasets from CSV files in the working directory, trains the
// classifier, classifies the first 10000 validation rows with one goroutine
// per row, and prints how many of them were classified correctly.
//
// Every failure that used to be discarded is now reported: setup errors abort
// the program, and a row that cannot be classified is logged and counted as
// incorrect so that the collector still receives exactly one result per row.
func main() {
	// Number of validation rows to classify.
	const numSamples = 10000

	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Deferred calls run last-in-first-out, so the profile is stopped
		// (and flushed) before the file is closed.
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}
	runtime.GOMAXPROCS(runtime.NumCPU())

	// NOTE(review): the trailing 0, 1 arguments presumably select the target
	// column range - confirm against csvparse.DatasetFromPath.
	println("parsing test")
	validationSample, err := csvparse.DatasetFromPath("many_features_test.csv", 0, 1)
	if err != nil {
		log.Fatalf("parsing test dataset: %v", err)
	}
	println("parsed test")
	println("parsing training")
	trainingSample, err := csvparse.DatasetFromPath("many_features_training.csv", 0, 1)
	if err != nil {
		log.Fatalf("parsing training dataset: %v", err)
	}
	println("parsed training")

	c, err := knn.NewKNNClassifier(1)
	if err != nil {
		log.Fatalf("creating classifier: %v", err)
	}
	// NOTE(review): if Train returns an error it should be checked here too.
	c.Train(trainingSample)

	var totalCorrect float32
	// Buffered for every result so no worker ever blocks on send.
	successChannel := make(chan float32, numSamples)

	for i := 0; i < numSamples; i++ {
		test, rowErr := validationSample.Row(i)
		if rowErr != nil {
			log.Fatalf("reading validation row %d: %v", i, rowErr)
		}
		go func(t row.Row, j int) {
			if j%5 == 0 {
				println("classifying", j)
			}
			// Use the row passed as an argument rather than the variable
			// captured from the enclosing loop.
			var score float32
			cl, classifyErr := c.Classify(t)
			switch {
			case classifyErr != nil:
				log.Printf("classifying row %d: %v", j, classifyErr)
			case cl.Equals(t.Target()):
				score = 1
			}
			successChannel <- score
			if j%5 == 0 {
				println("classified", j)
			}
		}(test, i)
	}

	// Collect exactly one result per started goroutine.
	for i := 0; i < numSamples; i++ {
		totalCorrect += <-successChannel
	}

	fmt.Println(totalCorrect)
}

// Takes about 50s on the beefy machine, *with* goodlearn optimized
// - replace math.Pow(x,2) with x*x in distance.go
// - SliceFromStrings was super slow, but assuming contiguous features and all of them are floats, much faster
// Outputs unreasonable answer: 10000

## raw-go-knn-perf.svg

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              raw-go-knn-perf.svg
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## raw-go-knn.go
package main

import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"runtime"
	"runtime/pprof"
	"strconv"
)

// LabelWithFeatures is one parsed CSV row: a class label together with the
// numeric feature vector that belongs to it.
type LabelWithFeatures struct {
	// Label is the raw, unparsed first field of the row.
	Label    []byte
	// Features holds the remaining fields of the row, parsed as numbers in
	// their original order.
	Features []float64
}

// NewLabelWithFeatures builds a LabelWithFeatures from one comma-split CSV
// row. The first field is kept as-is as the label; every following field is
// converted with byteSliceTofloat64 and stored, in order, as a feature.
//
// parsedLine must contain at least one field, otherwise the function panics.
func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures {
	head := parsedLine[0]
	rest := parsedLine[1:]

	values := make([]float64, len(rest))
	for k := range rest {
		values[k] = byteSliceTofloat64(rest[k])
	}

	return LabelWithFeatures{Label: head, Features: values}
}

// Separators used when splitting raw CSV content.
var (
	// newline separates the rows of a file.
	newline = []byte("\n")
	// comma separates the fields of a row.
	comma = []byte(",")
)

// byteSliceTofloat64 parses the textual number in b and returns it as a
// float64.
//
// Leading and trailing whitespace is removed first, so a field that carries a
// stray carriage return (CRLF line endings) or padding is parsed instead of
// silently turning into 0.
//
// The value is parsed with 32-bit precision, i.e. the result is the nearest
// float32 widened to float64, exactly as before.
func byteSliceTofloat64(b []byte) float64 {
	// The error is deliberately ignored: the signature has no way to report
	// it, and ParseFloat already yields 0 for malformed input and ±Inf for
	// out-of-range input, which is what callers have always received.
	x, _ := strconv.ParseFloat(string(bytes.TrimSpace(b)), 32)
	return x
}

// parseCSVFile reads the CSV file at filePath and returns one
// LabelWithFeatures per data row, in file order.
//
// The first line is treated as a header and skipped. Empty lines (including
// the one produced by a trailing newline) are skipped as well, and a trailing
// carriage return is stripped from every line so CRLF files parse correctly.
// A file without a trailing newline no longer loses its last row.
//
// If the file cannot be read the program is terminated with a log message,
// instead of failing later with an unrelated slice-length panic.
func parseCSVFile(filePath string) []LabelWithFeatures {
	fileContent, err := ioutil.ReadFile(filePath)
	if err != nil {
		log.Fatalf("reading %q: %v", filePath, err)
	}

	// bytes.Split always yields at least one element, so lines[1:] is safe.
	lines := bytes.Split(fileContent, newline)
	carriageReturn := []byte("\r")

	labelsWithFeatures := make([]LabelWithFeatures, 0, len(lines)-1)
	for _, line := range lines[1:] { // lines[0] is the header
		line = bytes.TrimSuffix(line, carriageReturn)
		if len(line) == 0 {
			continue
		}

		labelsWithFeatures = append(labelsWithFeatures, NewLabelWithFeatures(bytes.Split(line, comma)))
	}

	return labelsWithFeatures
}

// squareDistanceWithBailout accumulates the squared Euclidean distance between
// features1 and features2, one coordinate at a time, and stops as soon as the
// running total exceeds bailout.
//
// The returned value is the full squared distance when the limit was never
// exceeded, and otherwise the partial sum at the moment it first went above
// bailout. features2 must be at least as long as features1.
func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
	for i, a := range features1 {
		delta := a - features2[i]
		d += delta * delta

		if d > bailout {
			return d
		}
	}

	return d
}

// trainingSample is the labelled reference data that classify searches. It is
// loaded from "many_features_training.csv" during package initialisation,
// i.e. before main starts running.
var trainingSample = parseCSVFile("many_features_training.csv")

// classify returns the label of the trainingSample row whose feature vector
// has the smallest squared distance to features (a 1-nearest-neighbour
// lookup). On ties the earliest row wins, because a later row only replaces
// the current best when it is strictly closer.
//
// The best distance found so far is passed as the bailout to
// squareDistanceWithBailout, so rows that are already worse are abandoned
// early. trainingSample must not be empty, otherwise the function panics.
func classify(features []float64) (label []byte) {
	nearest := &trainingSample[0]
	best := squareDistanceWithBailout(features, nearest.Features, math.MaxFloat32)

	for i := range trainingSample {
		candidate := &trainingSample[i]
		if dist := squareDistanceWithBailout(features, candidate.Features, best); dist < best {
			nearest, best = candidate, dist
		}
	}

	return nearest.Label
}

// cpuprofile holds the value of the -cpuprofile flag: the path of the file a
// CPU profile is written to. It defaults to the empty string, in which case
// main does not start profiling.
var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

// main benchmarks the hand-written nearest-neighbour classifier.
//
// It optionally starts a CPU profile (-cpuprofile), parses the validation CSV
// file, classifies up to the first 10000 validation rows with one goroutine
// per row, and prints how many predicted labels equal the row's own label.
//
// The number of rows is capped at the size of the validation set, so a file
// with fewer than 10000 rows no longer causes an index-out-of-range panic.
func main() {
	flag.Parse()
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		// Deferred calls run last-in-first-out, so the profile is stopped
		// (and flushed) before the file is closed.
		defer f.Close()
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		defer pprof.StopCPUProfile()
	}
	runtime.GOMAXPROCS(runtime.NumCPU())

	println("parsing test")
	validationSample := parseCSVFile("many_features_test.csv")
	println("parsed test")

	// Classify at most 10000 rows, fewer if the file is smaller.
	numSamples := 10000
	if len(validationSample) < numSamples {
		numSamples = len(validationSample)
	}

	var totalCorrect float64
	// Buffered for every result so no worker ever blocks on send.
	successChannel := make(chan float64, numSamples)

	for i := 0; i < numSamples; i++ {
		go func(t LabelWithFeatures, j int) {
			if j%5 == 0 {
				println("classifying", j)
			}
			if bytes.Equal(t.Label, classify(t.Features)) {
				successChannel <- 1
			} else {
				successChannel <- 0
			}
			if j%5 == 0 {
				println("classified", j)
			}
		}(validationSample[i], i)
	}

	// Collect exactly one result per started goroutine.
	for i := 0; i < numSamples; i++ {
		totalCorrect += <-successChannel
	}

	fmt.Println(totalCorrect)
}

// Runs in about 10s on the beefy machine
// Has some reasonable number of correct, like 98k

## scikit-knn.py
import numpy
from sklearn.neighbors import KNeighborsClassifier

# Column 0 of every CSV row is the label.
LABEL_COLUMN = 0
# Columns 1 through 128 are the features. The previous index list [1,128]
# selected only the two columns 1 and 128, not the range between them.
# NOTE(review): assumes the files have exactly 128 feature columns - confirm.
FEATURE_COLUMNS = slice(1, 129)

# skiprows=1 ignores the header line; loadtxt opens and closes the file itself.
test = numpy.loadtxt("many_features_test.csv", delimiter=',', skiprows=1)
training = numpy.loadtxt("many_features_training.csv", delimiter=',', skiprows=1)

training_features = training[:, FEATURE_COLUMNS]
training_labels = training[:, LABEL_COLUMN]
test_features = test[:, FEATURE_COLUMNS]
test_labels = test[:, LABEL_COLUMN]

# Default neighbour search: print the number of correctly classified test rows.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(training_features, training_labels)
print(numpy.sum(knn.predict(test_features) == test_labels))
# Results recorded with the old two-column selection:
# Lightning fast, but...
# Something shitty, like 534 (out of ~43000)

# Brute-force neighbour search over the same data.
knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
knn.fit(training_features, training_labels)
print(numpy.sum(knn.predict(test_features) == test_labels))
# Results recorded with the old two-column selection:
# Takes a while, and then...
# Something shittier, like MemoryError
	package main

	import (
	"fmt"
	"github.com/amitkgupta/goodlearn/classifier/knn"
	"github.com/amitkgupta/goodlearn/csvparse"
	"github.com/amitkgupta/goodlearn/data/row"
	"runtime"

	"flag"
	"log"
	"os"
	"runtime/pprof"
	)

	// cpuprofile holds the value of the -cpuprofile flag: the path of the file a
	// CPU profile is written to. It defaults to the empty string, in which case
	// main does not start profiling.
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	// main benchmarks goodlearn's nearest-neighbour classifier (k = 1).
	//
	// It optionally starts a CPU profile (-cpuprofile), loads the validation and
	// training datasets from CSV files in the working directory, trains the
	// classifier, classifies the first 10000 validation rows with one goroutine
	// per row, and prints how many of them were classified correctly.
	//
	// Every failure that used to be discarded is now reported: setup errors abort
	// the program, and a row that cannot be classified is logged and counted as
	// incorrect so that the collector still receives exactly one result per row.
	func main() {
		// Number of validation rows to classify.
		const numSamples = 10000

		flag.Parse()
		if *cpuprofile != "" {
			f, err := os.Create(*cpuprofile)
			if err != nil {
				log.Fatal(err)
			}
			// Deferred calls run last-in-first-out, so the profile is stopped
			// (and flushed) before the file is closed.
			defer f.Close()
			if err := pprof.StartCPUProfile(f); err != nil {
				log.Fatal(err)
			}
			defer pprof.StopCPUProfile()
		}
		runtime.GOMAXPROCS(runtime.NumCPU())

		// NOTE(review): the trailing 0, 1 arguments presumably select the target
		// column range - confirm against csvparse.DatasetFromPath.
		println("parsing test")
		validationSample, err := csvparse.DatasetFromPath("many_features_test.csv", 0, 1)
		if err != nil {
			log.Fatalf("parsing test dataset: %v", err)
		}
		println("parsed test")
		println("parsing training")
		trainingSample, err := csvparse.DatasetFromPath("many_features_training.csv", 0, 1)
		if err != nil {
			log.Fatalf("parsing training dataset: %v", err)
		}
		println("parsed training")

		c, err := knn.NewKNNClassifier(1)
		if err != nil {
			log.Fatalf("creating classifier: %v", err)
		}
		// NOTE(review): if Train returns an error it should be checked here too.
		c.Train(trainingSample)

		var totalCorrect float32
		// Buffered for every result so no worker ever blocks on send.
		successChannel := make(chan float32, numSamples)

		for i := 0; i < numSamples; i++ {
			test, rowErr := validationSample.Row(i)
			if rowErr != nil {
				log.Fatalf("reading validation row %d: %v", i, rowErr)
			}
			go func(t row.Row, j int) {
				if j%5 == 0 {
					println("classifying", j)
				}
				// Use the row passed as an argument rather than the variable
				// captured from the enclosing loop.
				var score float32
				cl, classifyErr := c.Classify(t)
				switch {
				case classifyErr != nil:
					log.Printf("classifying row %d: %v", j, classifyErr)
				case cl.Equals(t.Target()):
					score = 1
				}
				successChannel <- score
				if j%5 == 0 {
					println("classified", j)
				}
			}(test, i)
		}

		// Collect exactly one result per started goroutine.
		for i := 0; i < numSamples; i++ {
			totalCorrect += <-successChannel
		}

		fmt.Println(totalCorrect)
	}

	// Takes about 50s on the beefy machine, with goodlearn optimized
	// - replace math.Pow(x,2) with x*x in distance.go
	// - SliceFromStrings was super slow, but assuming contiguous features and all of them are floats, much faster
	// Outputs unreasonable answer: 10000
	package main

	import (
	"bytes"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"runtime"
	"runtime/pprof"
	"strconv"
	)

	// LabelWithFeatures is one parsed CSV row: a class label together with the
	// numeric feature vector that belongs to it.
	type LabelWithFeatures struct {
	// Label is the raw, unparsed first field of the row.
	Label []byte
	// Features holds the remaining fields of the row, parsed as numbers in
	// their original order.
	Features []float64
	}

	// NewLabelWithFeatures builds a LabelWithFeatures from one comma-split CSV
	// row. The first field is kept as-is as the label; every following field is
	// converted with byteSliceTofloat64 and stored, in order, as a feature.
	//
	// parsedLine must contain at least one field, otherwise the function panics.
	func NewLabelWithFeatures(parsedLine [][]byte) LabelWithFeatures {
		head := parsedLine[0]
		rest := parsedLine[1:]

		values := make([]float64, len(rest))
		for k := range rest {
			values[k] = byteSliceTofloat64(rest[k])
		}

		return LabelWithFeatures{Label: head, Features: values}
	}

	// Separators used when splitting raw CSV content.
	var (
		// newline separates the rows of a file.
		newline = []byte("\n")
		// comma separates the fields of a row.
		comma = []byte(",")
	)

	// byteSliceTofloat64 parses the textual number in b and returns it as a
	// float64.
	//
	// Leading and trailing whitespace is removed first, so a field that carries a
	// stray carriage return (CRLF line endings) or padding is parsed instead of
	// silently turning into 0.
	//
	// The value is parsed with 32-bit precision, i.e. the result is the nearest
	// float32 widened to float64, exactly as before.
	func byteSliceTofloat64(b []byte) float64 {
		// The error is deliberately ignored: the signature has no way to report
		// it, and ParseFloat already yields 0 for malformed input and ±Inf for
		// out-of-range input, which is what callers have always received.
		x, _ := strconv.ParseFloat(string(bytes.TrimSpace(b)), 32)
		return x
	}

	func parseCSVFile(filePath string) []LabelWithFeatures {
	fileContent, _ := ioutil.ReadFile(filePath)
	lines := bytes.Split(fileContent, newline)
	numRows := len(lines)

	labelsWithFeatures := make([]LabelWithFeatures, numRows-2)

	for i, line := range lines {
	// skip headers
	if i == 0 \|\| i == numRows-1 {
	continue
	}

	labelsWithFeatures[i-1] = NewLabelWithFeatures(bytes.Split(line, comma))
	}

	return labelsWithFeatures
	}

	// squareDistanceWithBailout accumulates the squared Euclidean distance between
	// features1 and features2, one coordinate at a time, and stops as soon as the
	// running total exceeds bailout.
	//
	// The returned value is the full squared distance when the limit was never
	// exceeded, and otherwise the partial sum at the moment it first went above
	// bailout. features2 must be at least as long as features1.
	func squareDistanceWithBailout(features1, features2 []float64, bailout float64) (d float64) {
		for i, a := range features1 {
			delta := a - features2[i]
			d += delta * delta

			if d > bailout {
				return d
			}
		}

		return d
	}

	// trainingSample is the labelled reference data that classify searches. It is
	// loaded from "many_features_training.csv" during package initialisation,
	// i.e. before main starts running.
	var trainingSample = parseCSVFile("many_features_training.csv")

	// classify returns the label of the trainingSample row whose feature vector
	// has the smallest squared distance to features (a 1-nearest-neighbour
	// lookup). On ties the earliest row wins, because a later row only replaces
	// the current best when it is strictly closer.
	//
	// The best distance found so far is passed as the bailout to
	// squareDistanceWithBailout, so rows that are already worse are abandoned
	// early. trainingSample must not be empty, otherwise the function panics.
	func classify(features []float64) (label []byte) {
		nearest := &trainingSample[0]
		best := squareDistanceWithBailout(features, nearest.Features, math.MaxFloat32)

		for i := range trainingSample {
			candidate := &trainingSample[i]
			if dist := squareDistanceWithBailout(features, candidate.Features, best); dist < best {
				nearest, best = candidate, dist
			}
		}

		return nearest.Label
	}

	// cpuprofile holds the value of the -cpuprofile flag: the path of the file a
	// CPU profile is written to. It defaults to the empty string, in which case
	// main does not start profiling.
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	// main benchmarks the hand-written nearest-neighbour classifier.
	//
	// It optionally starts a CPU profile (-cpuprofile), parses the validation CSV
	// file, classifies up to the first 10000 validation rows with one goroutine
	// per row, and prints how many predicted labels equal the row's own label.
	//
	// The number of rows is capped at the size of the validation set, so a file
	// with fewer than 10000 rows no longer causes an index-out-of-range panic.
	func main() {
		flag.Parse()
		if *cpuprofile != "" {
			f, err := os.Create(*cpuprofile)
			if err != nil {
				log.Fatal(err)
			}
			// Deferred calls run last-in-first-out, so the profile is stopped
			// (and flushed) before the file is closed.
			defer f.Close()
			if err := pprof.StartCPUProfile(f); err != nil {
				log.Fatal(err)
			}
			defer pprof.StopCPUProfile()
		}
		runtime.GOMAXPROCS(runtime.NumCPU())

		println("parsing test")
		validationSample := parseCSVFile("many_features_test.csv")
		println("parsed test")

		// Classify at most 10000 rows, fewer if the file is smaller.
		numSamples := 10000
		if len(validationSample) < numSamples {
			numSamples = len(validationSample)
		}

		var totalCorrect float64
		// Buffered for every result so no worker ever blocks on send.
		successChannel := make(chan float64, numSamples)

		for i := 0; i < numSamples; i++ {
			go func(t LabelWithFeatures, j int) {
				if j%5 == 0 {
					println("classifying", j)
				}
				if bytes.Equal(t.Label, classify(t.Features)) {
					successChannel <- 1
				} else {
					successChannel <- 0
				}
				if j%5 == 0 {
					println("classified", j)
				}
			}(validationSample[i], i)
		}

		// Collect exactly one result per started goroutine.
		for i := 0; i < numSamples; i++ {
			totalCorrect += <-successChannel
		}

		fmt.Println(totalCorrect)
	}

	// Runs in about 10s on the beefy machine
	// Has some reasonable number of correct, like 98k
	import numpy
	from sklearn.neighbors import KNeighborsClassifier

	# Column 0 of every CSV row is the label.
	LABEL_COLUMN = 0
	# Columns 1 through 128 are the features. The previous index list [1,128]
	# selected only the two columns 1 and 128, not the range between them.
	# NOTE(review): assumes the files have exactly 128 feature columns - confirm.
	FEATURE_COLUMNS = slice(1, 129)

	# skiprows=1 ignores the header line; loadtxt opens and closes the file itself.
	test = numpy.loadtxt("many_features_test.csv", delimiter=',', skiprows=1)
	training = numpy.loadtxt("many_features_training.csv", delimiter=',', skiprows=1)

	training_features = training[:, FEATURE_COLUMNS]
	training_labels = training[:, LABEL_COLUMN]
	test_features = test[:, FEATURE_COLUMNS]
	test_labels = test[:, LABEL_COLUMN]

	# Default neighbour search: print the number of correctly classified test rows.
	knn = KNeighborsClassifier(n_neighbors=1)
	knn.fit(training_features, training_labels)
	print(numpy.sum(knn.predict(test_features) == test_labels))
	# Results recorded with the old two-column selection:
	# Lightning fast, but...
	# Something shitty, like 534 (out of ~43000)

	# Brute-force neighbour search over the same data.
	knn = KNeighborsClassifier(n_neighbors=1, algorithm='brute')
	knn.fit(training_features, training_labels)
	print(numpy.sum(knn.predict(test_features) == test_labels))
	# Results recorded with the old two-column selection:
	# Takes a while, and then...
	# Something shittier, like MemoryError