# Learn Gradient Boosting Model by coding
# good references: a slide deck on gradient boosting and the gbm package manual
# http://www.ccs.neu.edu/home/vip/teach/MLcourse/4_boosting/slides/gradient_boosting.pdf
# http://cran.r-project.org/web/packages/gbm/gbm.pdf
# 1. Gradient Boosting for Regression
# generate data
generate_func = function(n=1000,size=0.5){
# n: number of data points to generate
# size: fraction of the data used as the training set (default 50%)
set.seed(1)
x = seq(0,2*pi,length=n) # generate x
noise = rnorm(n,0,0.3)
y = sin(x) + noise # generate y
select_index = sample(1:n,size=size*n,replace = F)
# split data into train and test
train_x = x[select_index]
train_y = y[select_index]
test_x = x[-select_index]
test_y = y[-select_index]
data = list('train_x'=train_x,
'train_y'=train_y,
'test_x'=test_x,
'test_y'=test_y)
return(data)
}
data = generate_func()
objects(data)
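# quick look at the simulated data (not in the original gist; just to eyeball the curve)
plot(data$train_x, data$train_y, pch=16, col=rgb(0,0,0,0.3))
lines(sort(data$train_x), sin(sort(data$train_x)), col='red') # true signal sin(x)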
# train boosting regression tree
GBR = function(x,y,rate=1,iter=100){
# iter: number of boosting iterations; more trees fit the training data better, but beware of overfitting.
# rate: learning rate (shrinkage); a lower rate usually generalizes better but needs more iterations (longer training).
library(rpart)
max_depth=1 # decision stumps (trees of depth 1)
mu = mean(y) # start with an initial model
dy = y - mu # residuals: the part of y the current model does not yet explain
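# Why residuals? For squared-error loss L(y,f) = (y-f)^2/2, the negative gradient
# with respect to f is y - f, i.e. exactly the residual, so fitting each tree to
# the residuals is gradient descent in function space.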
learner = list() # define a learners holder
for (i in 1:iter) {
# use a weak model to improve
learner[[i]] = rpart(dy~x, # error is target variable
control=rpart.control(maxdepth=max_depth,cp=0)) # cp=0 grows the tree without the complexity penalty pruning it
# update the residuals for the next iteration
dy = dy - rate*predict(learner[[i]])
}
result = list('learner'=learner,
'rate'=rate,
'iter'=iter)
return(result)
}
model = GBR(data$train_x,data$train_y,iter=1000)
objects(model)
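# For comparison, the gbm package (manual linked above) implements the same idea.
# The lines below are an untested sketch, not part of the original walkthrough;
# they assume gbm is installed and use gbm's documented arguments.
library(gbm)
gbm_fit = gbm(y~x, data=data.frame(x=data$train_x, y=data$train_y),
distribution='gaussian', # squared-error loss
n.trees=1000, shrinkage=1, # same number of trees and learning rate as above
interaction.depth=1, bag.fraction=1) # stumps, no subsampling
gbm_pred = predict(gbm_fit, newdata=data.frame(x=data$test_x), n.trees=1000)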
# predict function
predict_GBR = function(x,y,model){
predict_y = list() # holds the staged predictions
mu = mean(y) # initial model: mean of the supplied y (for test data this reuses the test-set mean)
n = length(y)
iter = model$iter
rate = model$rate
predict_y[[1]] = rep(mu,n)
learner = model$learner
for (i in 2:iter) {
# accumulate: predict_y[[i]] is the initial mean plus the first i-1 trees
predict_y[[i]] = predict_y[[i-1]] + rate * predict(learner[[i-1]],newdata=data.frame(x))
}
# mean squared error for each staged prediction
mse = sapply(predict_y,function(pre_y) round(mean((y-pre_y)^2),3))
result = list('predict_y'=predict_y,
'mse'= mse)
return(result)
}
# predict train data
predict_train = predict_GBR(data$train_x,data$train_y,model=model)
objects(predict_train)
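# As an aside (a sketch reusing the 'model' and 'data' objects above), the same
# staged predictions can be rebuilt with a cumulative sum over the tree outputs,
# which makes the additive structure of the model explicit:
tree_out = lapply(model$learner, predict, newdata=data.frame(x=data$train_x))
staged = Reduce(`+`, lapply(tree_out, function(p) model$rate*p),
init=rep(mean(data$train_y), length(data$train_y)), accumulate=TRUE)
all.equal(staged[[100]], predict_train$predict_y[[100]], check.attributes=FALSE)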
# more trees give lower training error
plot(predict_train$mse,type='l')
# plot the effect of the boosted trees
plotfunc = function(x,y,predict,num){
library(ggplot2)
pre = predict$predict_y[[num]]
plotdf = data.frame(x=x,y=y,pre = pre)
mse = round(mean((y-pre)^2),3)
p = ggplot(plotdf,aes(x,y)) +
geom_point(alpha=0.5)
p = p + geom_line(aes(x,pre)) + xlab(paste0('mse=',mse))
plot(p)
}
# num=1: predict with only the mean of y (no trees yet)
plotfunc(data$train_x,data$train_y,predict_train,num=1)
# num=2: add the first tree
plotfunc(data$train_x,data$train_y,predict_train,num=2)
# adding more trees gives a better fit
plotfunc(data$train_x,data$train_y,predict_train,num=10)
plotfunc(data$train_x,data$train_y,predict_train,num=100)
# predict test data
predict_test = predict_GBR(data$test_x,data$test_y,model=model)
plot(predict_test$mse,type='l')
plotfunc(data$test_x,data$test_y,predict_test,10)
plotfunc(data$test_x,data$test_y,predict_test,100)
# compare different parameters
# create 2 models with different learning rates
model_1 = GBR(data$train_x,data$train_y,rate=1,iter=500)
model_2 = GBR(data$train_x,data$train_y,rate=0.1,iter=500)
# evaluating both models on train and test data gives 4 sets of results
predict_train_1 = predict_GBR(data$train_x,data$train_y,model=model_1)
predict_train_2 = predict_GBR(data$train_x,data$train_y,model=model_2)
predict_test_1 = predict_GBR(data$test_x,data$test_y,model=model_1)
predict_test_2 = predict_GBR(data$test_x,data$test_y,model=model_2)
# extract the mse curves of these 4 results
train_error_1 = predict_train_1$mse
train_error_2 = predict_train_2$mse
test_error_1 = predict_test_1$mse
test_error_2 = predict_test_2$mse
iter = 1:model_1$iter
# compare these mse curves (blue = train, red = test; solid lines = rate 1, dashed = rate 0.1)
plotdf = data.frame(iter,train_error_1,test_error_1,train_error_2,test_error_2)
p = ggplot(plotdf)+
geom_line(aes(x=iter,y=train_error_1),color='blue')+
geom_line(aes(x=iter,y=test_error_1),color='red') +
geom_line(aes(x=iter,y=train_error_2),linetype=2,color='blue')+
geom_line(aes(x=iter,y=test_error_2),linetype=2,color='red')
print(p)
# test error is the better measure of model performance
# a lower rate generalizes better but needs more iterations;
# which.min below reports the iteration with the lowest error on each curve
which.min(train_error_1)
which.min(train_error_2)
which.min(test_error_1)
which.min(test_error_2)
# 2. Gradient Boosting for Classification
# build a two-class data set from iris (drop virginica, keep two features)
data = subset(iris,Species!='virginica',select = c(1,2,5))
data$y = ifelse(data$Species == 'setosa',1,-1)
data$Species = NULL
names(data)[1:2] = c('x1','x2')
head(data)
p = ggplot(data,aes(x1,x2,color=factor(y)))+
geom_point()
print(p)
# train boosting tree for classification
GBC = function(data,rate=0.1,iter=100){
library(rpart)
max_depth=1
learner = list()
mu = mean(data$y==1)
# start with an initial constant model
# mu = p(y=1); the initial score is the log-odds: f = log(mu/(1-mu))
f = log(mu/(1-mu))
data$dy = data$y/(1+exp(data$y*f)) # dy is the negative gradient (pseudo-residual) of the log-loss
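# Derivation: with labels y in {-1,+1} and log loss L(y,f) = log(1+exp(-y*f)),
# dL/df = -y*exp(-y*f)/(1+exp(-y*f)) = -y/(1+exp(y*f)),
# so the negative gradient (the pseudo-residual fitted below) is y/(1+exp(y*f)).
# The starting value f = log(mu/(1-mu)) is the constant score whose sigmoid equals mu,
# i.e. the constant minimizer of the average log loss.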
for (i in 1:iter) {
# use a weak model to improve
learner[[i]] = rpart(dy~x1+x2,data=data,
control=rpart.control(maxdepth=max_depth,cp=0))
# update the additive score f
f = f + rate*predict(learner[[i]])
# recompute the pseudo-residuals with the new f
data$dy = data$y/(1+exp(data$y*f))
}
result = list('learner'=learner,
'rate'=rate,
'iter'=iter)
return(result)
}
model = GBC(data,rate=0.1,iter=1000)
objects(model)
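# A comparable fit with the gbm package would be an untested sketch along these lines;
# gbm's 'bernoulli' distribution expects 0/1 labels, so the -1/1 coding is remapped first.
library(gbm)
gbm_data = transform(data, y01 = ifelse(y==1, 1, 0))
gbm_clf = gbm(y01~x1+x2, data=gbm_data, distribution='bernoulli',
n.trees=1000, shrinkage=0.1, interaction.depth=1, bag.fraction=1)
gbm_prob = predict(gbm_clf, newdata=gbm_data, n.trees=1000, type='response') # p(y=1)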
# predict function
predict_GBC = function(data,model){
predict_y = list()
mu = mean(data$y==1)
f = log(mu/(1-mu))
n = nrow(data)
iter = model$iter
rate = model$rate
predict_y[[1]] = rep(f,n)
learner = model$learner
for (i in 2:iter) {
predict_y[[i]] = predict_y[[i-1]] + rate *predict(learner[[i-1]],newdata=data)
}
mse = sapply(predict_y,function(pre_y) sum(log(1+exp(-data$y*pre_y)))) # total logistic (log) loss; kept under the name 'mse' so the plotting code stays the same
result = list('predict_y'=predict_y,
'mse'= mse)
return(result)
}
# predict data
predict_train = predict_GBC(data,model=model)
objects(predict_train)
plot(predict_train$mse,type='l')
final = predict_train$predict_y[[1000]] # staged score after the last iteration
y_p = 1/(1+exp(-final)) # convert the score to a probability with the sigmoid
y_pred = ifelse(y_p>0.5,1,-1) # threshold at 0.5 to recover class labels
table(data$y, y_pred) # confusion matrix on the training data
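# overall training accuracy
mean(y_pred == data$y)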
# plot the fitted classes and the stump split lines
plotfunc2 = function(data,num,rate=0.1){
library(ggplot2)
model = GBC(data,rate=rate,iter=num)
predict_train = predict_GBC(data,model=model)
final = predict_train$predict_y[[num]]
y_p = 1/(1+exp(-final))
data$y_pre = ifelse(y_p>0.5,1,-1)
# split variable used by each stump
tree_f = sapply(model$learner,function(x){
temp = x$splits
return(row.names(temp)[1])
})
# split value (cut point) of each stump
tree_v = sapply(model$learner,function(x){
temp = x$splits
return(temp[1,'index'])
})
x1value = tree_v[tree_f=='x1']
x2value = tree_v[tree_f=='x2']
p = ggplot(data,aes(x=x1,y=x2))
p = p + geom_point(aes(color=factor(y_pre)),size=5) + # outer ring: predicted class
geom_point(aes(color=factor(y)),size=3)+ # inner dot: true class
geom_vline(xintercept = x1value,alpha=0.4) +
geom_hline(yintercept = x2value,alpha=0.4) +
scale_colour_brewer(type="seq", palette='Set1') +
theme_bw()
print(p)
}
plotfunc2(data,num=10)
plotfunc2(data,num=20)
plotfunc2(data,num=60)
plotfunc2(data,num=100)
plotfunc2(data,num=1000)