@stormxuwz
Created February 2, 2015 04:53
Some regression/classification in R
## Packages these wrappers depend on:
library(MASS)          # lda(), qda()
library(klaR)          # rda()
library(e1071)         # naiveBayes()
library(glmnet)        # cv.glmnet() for the lasso logistic model
library(kernlab)       # ksvm()
library(tree)          # tree(), cv.tree(), prune.tree()
library(randomForest)  # randomForest(), importance()
library(gbm)           # gbm(), gbm.perf()

#### LDA, QDA, RDA, naive Bayes functions ####
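## errorInfo() is called by every *.pred function below but is not defined
## in this gist; judging from the call sites it takes a list of per-data-set
## predictions plus a label and reports error rates. A hypothetical
## stand-in (an assumption, not the original helper) so the file can be
## sourced on its own:
if (!exists("errorInfo")) {
  errorInfo = function(predresult, title = "") {
    cat(title, ": predictions computed for", length(predresult), "data set(s)\n")
  }
}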
lda.model = function(traindata) {
  # Fit linear discriminant analysis; traindata must have a factor
  # response column named Y, with the predictors in the other columns
  lda.result = lda(Y ~ ., data = traindata)
  return(lda.result)
}
lda.pred = function(model, dataSets.X) {
  # Predict a class for every data set in the list dataSets.X
  predresult = lapply(dataSets.X, function(X) predict(model, X)$class)
  errorInfo(predresult, "lda")
  return(predresult)
}
####################################
qda.model = function(traindata) {
  qda.result = qda(Y ~ ., data = traindata)
  return(qda.result)
}
qda.pred = function(model, dataSets.X) {
  predresult = lapply(dataSets.X, function(X) predict(model, X)$class)
  errorInfo(predresult, "qda")
  return(predresult)
}
####################################
rda.model = function(traindata) {
  rda.result = rda(Y ~ ., data = traindata)
  return(rda.result)
}
rda.pred = function(model, dataSets.X) {
  predresult = lapply(dataSets.X, function(X) predict(model, X)$class)
  errorInfo(predresult, "rda")
  return(predresult)
}
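## Example usage of the discriminant wrappers above; qda.* and rda.* follow
## the same model/pred pattern. demo.data() is a hypothetical helper (not
## part of the original gist) that builds a synthetic stand-in for the data
## this code targets: first column Y, a factor with class labels 3 and 8,
## and numeric features in the remaining columns. The demos are wrapped in
## functions so sourcing the file has no side effects; call demo.lda() to run.
demo.data = function(n = 200) {
  set.seed(1)
  X = data.frame(x1 = rnorm(n), x2 = rnorm(n))
  data.frame(Y = factor(ifelse(X$x1 + X$x2 + rnorm(n, sd = 0.5) > 0, 8, 3)), X)
}
demo.lda = function() {
  train = demo.data()
  fit = lda.model(train)
  lda.pred(fit, list(train = train[, -1]))  # list of per-data-set class predictions
}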
####################################
baye.model = function(traindata) {
  baye.result = naiveBayes(Y ~ ., data = traindata)
  return(baye.result)
  ##### Manual implementation (unfinished sketch, kept for reference) #####
  # traindata.X = traindata[, -1]
  # Y = traindata$Y
  # ntrain = length(Y)
  # nfeatures = ncol(traindata.X)
  # p = sum(Y == 3) / ntrain  # empirical prior for class 3
  # for (i in 1:nfeatures) {
  #   traindata.X[, i]
  # }
}
baye.pred = function(model, dataSets.X) {
  predresult = lapply(dataSets.X, function(X) predict(model, X))
  errorInfo(predresult, "baye")
  return(predresult)
}
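## The naive Bayes wrappers follow the same pattern; a minimal sketch using
## the hypothetical demo.data() helper defined above.
demo.bayes = function() {
  train = demo.data()
  fit = baye.model(train)
  baye.pred(fit, list(train = train[, -1]))
}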
#### Functions for logistic regression model ####
logreg.select = function(traindata) {
  # Full logistic model; cap the IRLS fit at 200 iterations
  logreg.result = glm(Y ~ ., data = traindata, family = binomial, maxit = 200)
  # Null (intercept-only) model as the starting point for stepwise search
  glm0 = glm(Y ~ 1, data = traindata, family = binomial)
  # Forward stepwise selection by AIC (default k = 2) and BIC (k = log n)
  logreg.resultAIC = step(glm0, scope = list(upper = logreg.result),
                          trace = F, direction = "forward")
  logreg.resultBIC = step(glm0, scope = list(upper = logreg.result),
                          trace = F, k = log(nrow(traindata)), direction = "forward")
  # Lasso-penalized logistic regression with lambda chosen by
  # cross-validation; assumes the response Y is the first column
  X = as.matrix(traindata[, -1])
  Y = traindata[, 1]
  logreg.resultLasso = cv.glmnet(X, Y, family = "binomial", alpha = 1)
  return(list(fullmodel = logreg.result, AICmodel = logreg.resultAIC,
              BICmodel = logreg.resultBIC, lassomodel = logreg.resultLasso))
}
logreg.pred = function(log.model, dataSets.X, addtitle = "logit") {
  tmpfunc = function(data.X) {
    Yhat = predict(log.model, data.X, type = "response")
    # glm models the probability of the second factor level ("8"), so a
    # fitted probability above 0.5 is labeled 8 and otherwise 3
    Yhat = ifelse(Yhat > 0.5, 8, 3)
    Yhat = as.factor(Yhat)
    return(Yhat)
  }
  predresult = lapply(dataSets.X, tmpfunc)
  errorInfo(predresult, addtitle)
  return(predresult)
}
logreg.lasso.pred = function(model, dataSets.X) {
  tmpfunc = function(data.X) {
    # cv.glmnet predicts from a matrix; use the CV-optimal lambda
    Yhat = predict(model, as.matrix(data.X), type = "response", s = "lambda.min")
    Yhat = ifelse(Yhat > 0.5, 8, 3)
    Yhat = as.factor(Yhat)
    return(Yhat)
  }
  predresult = lapply(dataSets.X, tmpfunc)
  errorInfo(predresult, "Lasso")
  return(predresult)
}
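## Sketch for the logistic-regression wrappers: logreg.select() returns the
## full, AIC, BIC and lasso fits; the lasso model needs its own predict
## wrapper because cv.glmnet works on matrices. Uses the hypothetical
## demo.data() helper defined above.
demo.logreg = function() {
  train = demo.data()
  models = logreg.select(train)
  aic = logreg.pred(models$AICmodel, list(train = train[, -1]), "logit-AIC")
  lasso = logreg.lasso.pred(models$lassomodel, list(train = train[, -1]))
  list(aic = aic, lasso = lasso)
}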
###### Functions for SVM ######
svm.model = function(traindata, kn) {
  # kn = "quar" fits a quadratic polynomial kernel; any other value is
  # passed to ksvm as a kernlab kernel name (e.g. "rbfdot", "vanilladot")
  if (kn == "quar") {
    svm.result = ksvm(Y ~ ., data = traindata, kernel = "polydot", kpar = list(degree = 2))
  } else {
    svm.result = ksvm(Y ~ ., data = traindata, kernel = kn)
  }
  return(svm.result)
}
svm.pred = function(model, dataSets.X, addtitle = "") {
  predresult = lapply(dataSets.X, function(X) predict(model, X))
  errorInfo(predresult, addtitle)
  return(predresult)
}
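## Sketch for the SVM wrapper; kn = "quar" gives the quadratic kernel,
## while any kernlab kernel name such as "rbfdot" (Gaussian) also works.
demo.svm = function() {
  train = demo.data()
  fit = svm.model(train, kn = "rbfdot")
  svm.pred(fit, list(train = train[, -1]), "svm-rbf")
}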
##### Functions for the tree model #####
tree.build = function(traindata) {
  ### Build a deliberately large tree (small mindev/minsize), save its
  ### plot, then run 5-fold cross-validation to guide pruning
  tree.result = tree(Y ~ ., data = traindata, mindev = 0.005, minsize = 2)
  png("large_tree.png", width = 800, height = 800, res = 100)
  plot(tree.result)
  text(tree.result)
  dev.off()
  cv = cv.tree(tree.result, K = 5)
  png("tree_cv.png", width = 800, height = 800, res = 100)
  plot(cv)
  dev.off()
  return(list(tree.result, cv))
}
tree.prune = function(tree.result, size) {
  # Prune the large tree back to `size` terminal nodes by deviance
  pruned = prune.tree(tree.result, method = "deviance", best = size)
  png("Pruned_tree.png", width = 800, height = 800, res = 100)
  plot(pruned)
  text(pruned)
  dev.off()
  return(pruned)
}
tree.pred = function(tree.model, dataSets.X) {
  tmpfunc = function(X) {
    Y_hat = predict(tree.model, X)
    ### Y_hat is a matrix of class probabilities, one column per class;
    ### pick the likelier class and recover its label from the column names
    colClass = as.numeric(colnames(Y_hat))
    Y_hat = ifelse(Y_hat[, 1] > Y_hat[, 2], colClass[1], colClass[2])
    Y_hat = as.factor(Y_hat)
    return(Y_hat)
  }
  predresult = lapply(dataSets.X, tmpfunc)
  errorInfo(predresult, "tree")
  return(predresult)
}
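## Sketch of the tree pipeline: grow a large tree, inspect the CV curve,
## prune to a chosen size, then predict. The png files above are written as
## side effects; pruning to 4 leaves is an arbitrary choice here.
demo.tree = function() {
  train = demo.data()
  built = tree.build(train)            # list(tree, cv)
  pruned = tree.prune(built[[1]], 4)   # prune to 4 terminal nodes
  tree.pred(pruned, list(train = train[, -1]))
}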
##### Functions for random forest #####
rf.model = function(traindata) {
  # mtry is left at its default, floor(sqrt(p)) for classification
  rf.result = randomForest(Y ~ ., data = traindata, importance = T, ntree = 1000)
  # With two classes, column 3 of the importance matrix is the
  # (unscaled) mean decrease in accuracy
  sortedImpt = sort(importance(rf.result, scale = F)[, 3], decreasing = T)
  png("rf_imp.png", width = 800, height = 800, res = 100)
  barplot(sortedImpt, horiz = TRUE, col = "blue", space = .5,
          names.arg = substr(names(sortedImpt), 2, 5), cex.names = 0.5)
  dev.off()
  return(list(rf.result, sortedImpt))
}
rf.pred = function(model, dataSets.X) {
  predresult = lapply(dataSets.X, function(X) predict(model, X))
  errorInfo(predresult, "rf")
  return(predresult)
}
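## Sketch for the random forest wrapper; rf.model() also writes the sorted
## variable-importance barplot to rf_imp.png.
demo.rf = function() {
  train = demo.data()
  fit = rf.model(train)  # list(forest, sorted importance)
  rf.pred(fit[[1]], list(train = train[, -1]))
}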
##### Functions for gradient boosting (gbm) #####
gbm.model = function(traindata) {
  # gbm's adaboost distribution expects a 0/1 response: class 3 -> 0, 8 -> 1
  traindata$Y = ifelse(traindata$Y == 3, 0, 1)
  gbm.result = gbm(Y ~ ., data = traindata, n.trees = 5000, shrinkage = 0.005,
                   bag.fraction = 0.5, train.fraction = 1,
                   distribution = "adaboost", cv.folds = 3)
  png("GBMiter.png", width = 1000, height = 800, res = 100)
  # Alternative stopping rules:
  # best.iter.oob <- gbm.perf(gbm.result, method = "OOB")
  # best.iter.test <- gbm.perf(gbm.result, method = "test")
  best.iter.cv <- gbm.perf(gbm.result, method = "cv")
  dev.off()
  return(list(gbm.result, best.iter.cv))
}
gbm.pred = function(model, dataSets.X, best.iter) {
  tmpfunc = function(X) {
    Yhat = predict(model, X, best.iter)  # scores on the link (logit) scale
    Yhat = plogis(Yhat)                  # map to the probability scale
    Yhat = ifelse(Yhat > 0.5, 8, 3)
    Yhat = as.factor(Yhat)
    return(Yhat)
  }
  predresult = lapply(dataSets.X, tmpfunc)
  errorInfo(predresult, "gbm")
  return(predresult)
}
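## Sketch for the GBM wrappers; gbm.model() recodes Y to 0/1 internally and
## returns the CV-selected iteration count alongside the fit. Note that
## 5000 trees with 3-fold CV can take a while even on small data.
demo.gbm = function() {
  train = demo.data()
  fit = gbm.model(train)
  gbm.pred(fit[[1]], list(train = train[, -1]), best.iter = fit[[2]])
}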