# lynaghk/gist:790631

## gistfile1.r
# I'm using two non-standard packages: one for nearest neighbors (FNN) and one for split-apply-combine (think map-reduce) type operations (plyr).
# You'll have to run this first: install.packages(c('FNN', 'plyr'))
# You can also check out the documentation in R using the question mark; run this: ?knn
library(FNN)
library(plyr)

# Build 100 random points on the plane as toy data for the classifier.
# Each cluster holds 50 points with a standard-normal x coordinate; the y
# coordinate is drawn with sd 0.5 around +1 (top cluster) or -1 (bottom).
neartop <- ldply(seq_len(50), function(i) {
  x_coord <- rnorm(1)
  y_coord <- 1 + rnorm(1, sd = 0.5)
  c(x = x_coord, y = y_coord)
})

nearbottom <- ldply(seq_len(50), function(i) {
  x_coord <- rnorm(1)
  y_coord <- -1 + rnorm(1, sd = 0.5)
  c(x = x_coord, y = y_coord)
})

# Stack the clusters: rows 1-50 are the top cluster, rows 51-100 the bottom.
points <- rbind(neartop, nearbottom)

# Logical class label answering "is this point in the set near the top?":
# TRUE for the first 50 rows, FALSE for the remaining 50.
points$label <- seq_len(100) <= 50

# Scatter plot of every generated point.
plot(points$x, points$y)

# Split into training/testing sets: sample 60 row indices for training and
# keep the remaining rows for testing. Only the x/y coordinates are handed to
# the classifier; the labels are looked up separately via `train_i`.
train_i <- sample(nrow(points), 60)
train <- points[train_i, c('x', 'y')]
test <- points[-train_i, c('x', 'y')]

# Classify each test point from the labelled training points.
# The original call carried a stray third positional argument, `cpt_vecs`,
# which is never defined in this script and is not one of knn()'s data
# arguments (train, test, cl), so it has been removed.
results <- knn(
  train,
  test,
  cl = points[train_i, 'label'],
  k = 1  # k = 1 just assigns a point to its nearest neighbor.
)

# Fraction of test points classified correctly (between 0 and 1; multiply by
# 100 for a percentage). mean() of the logical vector equals the count of
# matches divided by the number of test rows.
mean(points[-train_i, 'label'] == results)
	# I'm using two non-standard packages: one for nearest neighbors (FNN) and one for split-apply-combine (think map-reduce) type operations (plyr).
	# You'll have to run this first: install.packages(c('FNN', 'plyr'))
	# You can also check out the documentation in R using the question mark; run this: ?knn
	library(FNN)
	library(plyr)

	# Build 100 random points on the plane as toy data for the classifier.
	# Each cluster holds 50 points with a standard-normal x coordinate; the y
	# coordinate is drawn with sd 0.5 around +1 (top cluster) or -1 (bottom).
	neartop <- ldply(seq_len(50), function(i) {
	  x_coord <- rnorm(1)
	  y_coord <- 1 + rnorm(1, sd = 0.5)
	  c(x = x_coord, y = y_coord)
	})

	nearbottom <- ldply(seq_len(50), function(i) {
	  x_coord <- rnorm(1)
	  y_coord <- -1 + rnorm(1, sd = 0.5)
	  c(x = x_coord, y = y_coord)
	})

	# Stack the clusters: rows 1-50 are the top cluster, rows 51-100 the bottom.
	points <- rbind(neartop, nearbottom)

	# Logical class label answering "is this point in the set near the top?":
	# TRUE for the first 50 rows, FALSE for the remaining 50.
	points$label <- seq_len(100) <= 50

	# Scatter plot of every generated point.
	plot(points$x, points$y)

	# Split into training/testing sets: sample 60 row indices for training and
	# keep the remaining rows for testing. Only the x/y coordinates are handed to
	# the classifier; the labels are looked up separately via `train_i`.
	train_i <- sample(nrow(points), 60)
	train <- points[train_i, c('x', 'y')]
	test <- points[-train_i, c('x', 'y')]

	# Classify each test point from the labelled training points.
	# The original call carried a stray third positional argument, `cpt_vecs`,
	# which is never defined in this script and is not one of knn()'s data
	# arguments (train, test, cl), so it has been removed.
	results <- knn(
	  train,
	  test,
	  cl = points[train_i, 'label'],
	  k = 1  # k = 1 just assigns a point to its nearest neighbor.
	)

	# Fraction of test points classified correctly (between 0 and 1; multiply by
	# 100 for a percentage). mean() of the logical vector equals the count of
	# matches divided by the number of test rows.
	mean(points[-train_i, 'label'] == results)