# Gist by @stormxuwz, created September 22, 2016.
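# ---- classifier.R (file name inferred from the source() calls in the driver scripts below) ----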
# Random forest wrapper: fits 500 trees and returns predicted labels for matTest.
randomForest_clf <- function(matTrain, trainLabel, matTest, ...){
  library(randomForest)
  rf_model <- randomForest(matTrain, trainLabel, ntree = 500)
  rf_predict <- predict(rf_model, matTest)
  return(rf_predict)
}
# xgboost wrapper: 1000 boosting rounds with shallow trees.
xgboost_clf_1000 <- function(matTrain, trainLabel, matTest, ...){
  library(xgboost)
  # Map each cuisine label to a 0-based class index, as required by multi:softmax.
  uniqueLabel <- unique(trainLabel)
  nclass <- length(uniqueLabel)
  indexIngredientMapping <- seq_len(nclass) - 1
  names(indexIngredientMapping) <- uniqueLabel
  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = indexIngredientMapping[as.character(trainLabel)])
  xgb_model <- xgboost(dtrain, max.depth = 6, eta = 0.2, nround = 1000,
                       objective = "multi:softmax", num_class = nclass)
  # multi:softmax predicts 0-based class indices; map them back to label names.
  xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest))
  xgboost_predict <- names(indexIngredientMapping[xgboost_predict + 1])
  return(xgboost_predict)
}
# xgboost wrapper: deep trees, only 5 boosting rounds.
xgboost_clf_200 <- function(matTrain, trainLabel, matTest, ...){
  library(xgboost)
  # Map each cuisine label to a 0-based class index, as required by multi:softmax.
  uniqueLabel <- unique(trainLabel)
  nclass <- length(uniqueLabel)
  indexIngredientMapping <- seq_len(nclass) - 1
  names(indexIngredientMapping) <- uniqueLabel
  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = indexIngredientMapping[as.character(trainLabel)])
  xgb_model <- xgboost(dtrain, max.depth = 25, eta = 0.3, nround = 5,
                       objective = "multi:softmax", num_class = nclass)
  xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest))
  xgboost_predict <- names(indexIngredientMapping[xgboost_predict + 1])
  return(xgboost_predict)
}
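# Sketch of the label/index round trip used in the xgboost wrappers above
# (the toy labels "a"/"b"/"c" are hypothetical, not from the data set):
# m <- setNames(0:2, c("a", "b", "c"))   # label -> 0-based class index
# m[c("b", "a")]                         # 1 0, the numeric labels fed to xgboost
# names(m[c(1, 0) + 1])                  # "b" "a", predicted indices mapped back to labels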
# k-nearest-neighbour wrapper (k = 5) via caret::knn3Train.
knn_clf <- function(matTrain, trainLabel, matTest, ...){
  library(caret)
  knn_predict <- knn3Train(matTrain, matTest, trainLabel, k = 5, prob = FALSE)
  return(knn_predict)
}
# Feed-forward neural network wrapper: one-hot encodes the labels, fits a
# 5-3 hidden-layer network, and returns the class with the largest output.
neuralnet_clf <- function(matTrain, trainLabel, matTest, ...){
  library(neuralnet)
  # One-hot encode the labels; model.matrix names the columns "trainLabel<class>".
  newY <- as.data.frame(model.matrix(~ -1 + trainLabel))
  matTrain <- cbind(matTrain, newY)
  n <- names(matTrain)
  f <- as.formula(paste(paste(names(newY), collapse = "+"), "~",
                        paste(n[!n %in% names(newY)], collapse = " + ")))
  nn <- neuralnet(f, data = matTrain, hidden = c(5, 3), linear.output = TRUE)
  nn_predict <- compute(nn, matTest)$net.result
  # Strip the "trainLabel" prefix (10 characters) to recover the class names.
  colnames(nn_predict) <- substring(names(newY), 11)
  final <- colnames(nn_predict)[apply(nn_predict, 1, which.max)]
  return(final)
}
# Runs each named classifier on the same train/test split and collects the
# predictions into one data frame, one column per method.
clf_controller <- function(matTrain, trainLabel, matTest, methods = c("knn_clf", "neuralnet_clf"), ...){
  finalResult <- data.frame(id = 1:nrow(matTest))
  for(clf in methods){
    print(paste("doing", clf))
    prediction <- do.call(clf, list(matTrain, trainLabel, matTest))
    finalResult[clf] <- prediction
    # write.table(finalResult,file="tmp_results.csv",sep=",",row.names = FALSE)
  }
  return(finalResult)
}
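# Minimal usage sketch (not part of the original pipeline): iris stands in for
# the ingredient feature matrix; assumes caret and randomForest are installed.
# idx <- sample(nrow(iris), 100)
# demo <- clf_controller(as.matrix(iris[idx, 1:4]), iris$Species[idx],
#                        as.matrix(iris[-idx, 1:4]),
#                        methods = c("knn_clf", "randomForest_clf"))
# head(demo)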
# SVM wrapper with an RBF (radial) kernel.
svm_clf_radial <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- svm(matTrain, trainLabel, kernel = "radial")
  res <- predict(model, newdata = matTest)
  return(res)
}
# SVM wrapper with a linear kernel.
svm_linear_clf <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- svm(matTrain, trainLabel, kernel = "linear")
  res <- predict(model, newdata = matTest)
  return(res)
}
# Naive Bayes wrapper via e1071::naiveBayes.
naiveBayes_clf <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- naiveBayes(matTrain, trainLabel)
  res <- predict(model, newdata = matTest)
  return(res)
}
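# ---- Cross-validation driver script ----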
rm(list=ls())
library(jsonlite)
library(caret)
setwd("~/Developer/cooking")
source("./code/preprocessing.R")
source("./code/classifier.R")
source("./code/featureEngineering.R")
train_raw <- fromJSON("./data/train.json", flatten = TRUE)
test_raw <- fromJSON("./data/test.json", flatten = TRUE)
trainLabel <- as.factor(train_raw$cuisine)
# Creates 5 stratified train/validation splits (each keeping a fraction p for
# training) and saves the training indices so every experiment reuses the same splits.
createSplit <- function(p = 0.8){
  splitList <- list()
  for(i in 1:5){
    train.index <- createDataPartition(trainLabel, p = p, list = FALSE)
    splitList[[i]] <- train.index
  }
  saveRDS(splitList, "./data/cv_trainIndex.rds")
}
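# Usage sketch (assumes ./data exists and trainLabel is defined above):
# createSplit(0.8)
# str(readRDS("./data/cv_trainIndex.rds"), max.level = 1)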
# Runs five repeated random train/validation splits for one feature type
# ("word"/"phrase") and one feature-selection size ("small"/"large"), returning
# a 5 x n_methods accuracy matrix (rows = splits, columns = classifiers).
cv_crossValidation <- function(type, featureNum){
  tmp <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
  trainLabel <- tmp$trainLabel
  feature <- tmp$feature
  rm(tmp)
  matTrain <- feature$trainFeature
  rm(feature)
  splitList <- readRDS("./data/cv_trainIndex.rds")
  featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))
  # methodList <- c("xgboost_clf_200","xgboost_clf_1000","naiveBayes_clf","svm_linear_clf","svm_clf_radial")
  methodList <- c("xgboost_clf_200")
  scores <- matrix(0, 5, length(methodList))
  colnames(scores) <- methodList
  for(i in 1:5){
    trainIndex <- splitList[[i]]
    subTrain <- matTrain[trainIndex, featureIndex]
    subTest <- matTrain[-trainIndex, featureIndex]
    subTrainLabel <- trainLabel[trainIndex]
    subTestLabel <- trainLabel[-trainIndex]
    oneFoldResults <- clf_controller(subTrain, subTrainLabel, subTest, methods = methodList)
    for(j in 1:length(methodList)){
      # Classification accuracy on the held-out part of this split.
      scores[i, j] <- sum(oneFoldResults[, methodList[j]] == subTestLabel) / length(subTestLabel)
    }
  }
  return(scores)
}
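# Usage sketch (assumes the dataSet_final_*.rds and feature-index files created
# by the main script already exist); column means give per-model CV accuracy:
# scores <- cv_crossValidation("word", "small")
# colMeans(scores)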
# Runs the cross validation for every combination of feature type and
# feature-selection size and saves the accuracy matrices.
cv_main <- function(){
  word_score_smallFeature <- cv_crossValidation("word", "small")
  word_score_largeFeature <- cv_crossValidation("word", "large")
  phrase_score_smallFeature <- cv_crossValidation("phrase", "small")
  phrase_score_largeFeature <- cv_crossValidation("phrase", "large")
  saveRDS(list(word_score_smallFeature, word_score_largeFeature,
               phrase_score_smallFeature, phrase_score_largeFeature), "cvResults.rds")
  write.table(word_score_largeFeature, "word_score_largeFeature.csv")
  write.table(word_score_smallFeature, "word_score_smallFeature.csv")
  write.table(phrase_score_largeFeature, "phrase_score_largeFeature.csv")
  write.table(phrase_score_smallFeature, "phrase_score_smallFeature.csv")
}
# run
cv_main()
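# ---- featureEngineering.R (file name inferred from the source() calls in the driver scripts) ----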
library(tm)
# Word-level features: splits every ingredient phrase into words via a
# document-term matrix, binarises it, stems the word columns, and drops a few
# function words.
createWordFeature <- function(trainData, testData){
  c_ingredients <- c(Corpus(VectorSource(trainData$ingredients)),
                     Corpus(VectorSource(testData$ingredients)))
  c_ingredientsDTM <- DocumentTermMatrix(c_ingredients)
  c_ingredientsDTM <- as.matrix(c_ingredientsDTM)
  # Keep only presence/absence, not counts.
  c_ingredientsDTM <- ifelse(c_ingredientsDTM > 0, 1, 0)
  c_ingredientsDTM <- as.data.frame(c_ingredientsDTM)
  # c_ingredientsDTM$ingredients_count <- rowSums(c_ingredientsDTM)
  trainFeature <- c_ingredientsDTM[1:nrow(trainData), ]
  testFeature <- c_ingredientsDTM[-(1:nrow(trainData)), ]
  trainFeature <- featureStemming(trainFeature)
  testFeature <- featureStemming(testFeature)
  # Remove function words that carry no cuisine information.
  funcIndex <- names(trainFeature) %in% c("for", "in", "all", "with")
  trainFeature <- trainFeature[, !funcIndex]
  testFeature <- testFeature[, !funcIndex]
  # This part is important
  return(list(trainFeature = trainFeature, testFeature = testFeature))
}
# Phrase-level features: treats every distinct ingredient string as its own
# binary indicator column (no tokenisation).
createPhraseFeature <- function(trainData, testData){
  uniqueLabel <- unique(c(unlist(trainData$ingredients), unlist(testData$ingredients)))
  trainFeature <- matrix(0, nrow(trainData), length(uniqueLabel))
  testFeature <- matrix(0, nrow(testData), length(uniqueLabel))
  colnames(trainFeature) <- uniqueLabel
  colnames(testFeature) <- uniqueLabel
  for(i in 1:nrow(trainData)){
    trainFeature[i, trainData$ingredients[[i]]] <- 1
  }
  for(i in 1:nrow(testData)){
    testFeature[i, testData$ingredients[[i]]] <- 1
  }
  # colFeatureCol <- colSums(trainFeature)
  return(list(trainFeature = as.data.frame(trainFeature), testFeature = as.data.frame(testFeature)))
}
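# Sketch of the phrase encoding on toy data (hypothetical recipes, not from the data set):
# tr <- data.frame(id = 1:2); tr$ingredients <- list(c("salt", "basil"), "salt")
# te <- data.frame(id = 3);   te$ingredients <- list("basil")
# createPhraseFeature(tr, te)$trainFeature   # rows = recipes, one 0/1 column per ingredient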
# Collapses word columns that share the same stem (SnowballC/Porter) into a
# single binary column via an element-wise OR.
featureStemming <- function(data){
  library(SnowballC)
  n <- nrow(data)
  stemmedFeature <- wordStem(names(data))
  uniStemmed <- unique(stemmedFeature)
  newData <- matrix(0, nrow = n, ncol = length(uniStemmed))
  colnames(newData) <- uniStemmed
  newData <- as.data.frame(newData)
  for(i in 1:length(stemmedFeature)){
    # OR the original column into the column of its stem.
    newData[, stemmedFeature[i]] <- ifelse(newData[, stemmedFeature[i]] + data[, i] > 0, 1, 0)
  }
  return(newData)
}
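# Toy illustration (hypothetical columns): singular/plural word columns that
# share a Porter stem should collapse into one column.
# toy <- data.frame(tomatoes = c(1, 0), tomato = c(0, 1), basil = c(1, 1))
# featureStemming(toy)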
createFeature <- function(trainData, testData, type = "word"){
  if(type == "word"){
    featureList <- createWordFeature(trainData, testData)
  }else if(type == "phrase"){
    featureList <- createPhraseFeature(trainData, testData)
  }
  trainFeature <- featureList$trainFeature
  testFeature <- featureList$testFeature
  # Do feature selection
  # print("Do Feature Selection")
  # featureIndex <- featureSelection(trainFeature,trainData$cuisine,thresh=2,type="mi")
  # trainFeature <- trainFeature[,featureIndex]
  # testFeature <- testFeature[,featureIndex]
  # # Do feature engineering
  # print("Do Feature Engineering")
  # trainFeature <- cbind(trainFeature,featureEngineering(trainFeature))
  # testFeature <- cbind(testFeature,featureEngineering(testFeature))
  return(list(trainFeature = trainFeature, testFeature = testFeature))
}
featureEngineering <- function(trainFeature){
  # adding ingredient count
  numOfIngredients <- rowSums(trainFeature)
  # adding color information
  # adding the ratio of meat
  additionalFeature <- data.frame(numOfIngredients = numOfIngredients)
  return(additionalFeature)
}
# Selects feature columns by entropy or mutual information (MI) against the
# cuisine labels; in either case a feature must also appear in more than 10 recipes.
featureSelection <- function(trainFeature, trainLabel, thresh, type = "mi"){
  library(entropy)
  # trainFeature is a binary data frame, one column per ingredient/word.
  n <- ncol(trainFeature)
  featureIndex <- 1:ncol(trainFeature)
  featureColSum <- colSums(trainFeature)
  frequencyFeature <- which(featureColSum > 10)
  if(type == "entropy"){
    # Keep features whose label distribution (among recipes containing the
    # feature) has low entropy, i.e. the feature is concentrated in few cuisines.
    entropy <- rep(0, n)
    for(i in 1:n){
      index <- which(trainFeature[, i] == 1)
      entropy[i] <- entropy.empirical(table(trainLabel[index]), unit = "log")
    }
    featureIndex <- which(entropy < thresh)
  }
  if(type == "mi"){
    # Keep features with high mutual information between the feature indicator
    # and the cuisine label.
    mi <- rep(0, n)
    for(i in 1:n){
      mi[i] <- mi.empirical(table(trainLabel, trainFeature[, i]), unit = "log")
    }
    featureIndex <- which(mi > thresh)
  }
  featureIndex <- intersect(featureIndex, frequencyFeature)
  return(featureIndex)
}
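# Sketch of the MI criterion on toy data (hypothetical, not from the data set):
# library(entropy)
# lab  <- factor(c("a", "a", "b", "b"))
# feat <- c(1, 1, 0, 0)                        # perfectly informative binary feature
# mi.empirical(table(lab, feat), unit = "log") # log(2) ~ 0.69 nats
# A column is kept when this value exceeds `thresh` and its column sum is > 10.

# ---- Main driver script ----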
rm(list=ls())
library(jsonlite)
setwd("~/Developer/cooking")
source("./code/preprocessing.R")
source("./code/classifier.R")
source("./code/featureEngineering.R")
# Main entry point. test: run the classifiers on a small cached subset;
# createNew: rebuild features from the raw JSON; type: "word" or "phrase";
# featureNum: which saved feature index to use ("small" or "large").
main <- function(test = FALSE, createNew = FALSE, type = "word", featureNum = "small"){
  print("reading raw data")
  train_raw <- fromJSON("./data/train.json", flatten = TRUE)
  test_raw <- fromJSON("./data/test.json", flatten = TRUE)
  if(test){
    tmp <- readRDS("test.RDS")
    matTrain <- tmp[[1]]
    matTest <- tmp[[2]]
    trainLabel <- as.factor(as.character(tmp[[3]]))
    finalResult <- clf_controller(matTrain, trainLabel, matTest,
                                  methods = c("knn_clf", "neuralnet_clf", "xgboost_clf_200", "randomForest_clf"))
    print(finalResult)
    write.table(finalResult, file = "test.csv", sep = ",", row.names = FALSE)
  }
  else{
    if(createNew){
      print("preprocessing data")
      train_raw <- preprocessing(train_raw)
      test_raw <- preprocessing(test_raw)
      saveRDS(list(train_raw, test_raw), "preprocessed.rds")
      trainLabel <- as.factor(train_raw$cuisine)
      testid <- test_raw$id
      print("creating features")
      feature <- createFeature(train_raw, test_raw, type)
      print("saving files")
      saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_", type, ".rds", sep = ""))
      tmp <- readRDS(paste("dataSet_", type, ".rds", sep = ""))
      feature <- tmp$feature
      matTrain <- feature$trainFeature
      matTest <- feature$testFeature
      Train_numOfIngredients <- unlist(lapply(train_raw$ingredients, length))
      Test_numOfIngredients <- unlist(lapply(test_raw$ingredients, length))
      # "small" feature set: mutual-information feature selection.
      if(type == "word"){
        featureIndex <- featureSelection(matTrain, trainLabel, thresh = exp(-7.5), type = "mi")
      }else{
        featureIndex <- featureSelection(matTrain, trainLabel, thresh = exp(-7.8), type = "mi")
      }
      cat("select", length(featureIndex), "from original feature num", ncol(matTrain))
      saveRDS(featureIndex, paste("./data/", type, "_featureIndex_small.rds", sep = ""))
      # "large" feature set: keep every column; +1 accounts for the numOfIngredient column appended below.
      saveRDS(c(1:(ncol(matTrain) + 1)), paste("./data/", type, "_featureIndex_large.rds", sep = ""))
      # feature engineering: add the ingredient count as an extra feature
      feature$trainFeature$numOfIngredient <- Train_numOfIngredients
      feature$testFeature$numOfIngredient <- Test_numOfIngredients
      saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_final_", type, ".rds", sep = ""))
    }else{
      print("reading the saved data")
      tmp <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
      trainLabel <- tmp$trainLabel
      testid <- tmp$testid
      feature <- tmp$feature
      rm(tmp)
      matTrain <- feature$trainFeature
      matTest <- feature$testFeature
      rm(feature)
      featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))
      matTrain <- matTrain[, featureIndex]
      matTest <- matTest[, featureIndex]
      methodList <- c("xgboost_clf_200", "xgboost_clf_1000", "naiveBayes_clf", "svm_linear_clf", "svm_clf_radial")
      finalResult <- clf_controller(matTrain, trainLabel, matTest, methods = methodList)
      finalResult$id <- testid
      write.table(finalResult, file = "final.csv", sep = ",", row.names = FALSE)
    }
  }
  # if(test){
  #   matTrain <- matTrain[1:100,1:100]
  #   matTest <- matTest[1:100,1:100]
  #   trainLabel <- trainLabel[1:100]
  #   saveRDS(list(matTrain,matTest,trainLabel),"test.RDS")
  # }
}
# main(test=FALSE,createNew=TRUE,"word")
main(test=FALSE,createNew=TRUE,"phrase")
# main(test=FALSE,createNew=FALSE,"word")
# Train_colorNum <- rowSums(matTrain[,c("red","yellow","black","green","white","blue","purpl")])
# Train_numOfWords <- rowSums(matTrain)
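# ---- preprocessing.R (file name inferred from the source() calls in the driver scripts) ----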
# Normalises the raw ingredient strings: lower-cases them, glues common
# multi-word modifiers together with underscores so they survive word
# tokenisation, and strips brand marks and non-alphabetic characters.
preprocessing <- function(data){
  data$ingredients <- lapply(data$ingredients, FUN = tolower)
  data$ingredients <- lapply(data$ingredients, FUN = function(x){
    x <- gsub("-", " ", x)
    x <- gsub("_", " ", x)
    x <- gsub("not low fat", "fat", x)        # must run before the "low fat" rule below
    x <- gsub("low fat", "low_fat", x)
    x <- gsub("all purpose", "all_purpose", x)
    x <- gsub("reduced fat", "reduced_fat", x)
    # x <- gsub("black pepper","black_pepper",x)
    x <- gsub("olive oil", "olive_oil", x)
    # x <- gsub("country style","country_style",x)
    x <- gsub("no calorie", "no_calorie", x)
    x <- gsub("no salt added", "no_salt_added", x)
    x <- gsub("greekstyl", "greek style", x)
    x <- gsub("\\bdress\\b", "dressing", x)   # word boundary so "dressing" is left alone
    x <- gsub("kraft", "", x)
    x <- gsub("color", "colour", x)
    # x <- gsub("ice cream","ice_cream",x)
    x <- gsub(" ic ", " ice ", x)
    x <- gsub("lowfat", "low_fat", x)
    x <- gsub("knorr homestyl stock", "", x)
    x <- gsub("half & half", "half_half", x)
    x <- gsub("gluten free", "gluten_free", x)
    x <- gsub("long grain", "long_grain", x)
    x <- gsub("extra virgin", "extra_virgin", x)
    x <- gsub("low sodium", "low_sodium", x)
    x <- gsub("canola oil", "canola_oil", x)
    x <- gsub("five spice", "five_spice", x)
    x <- gsub("free range", "free_range", x)
    x <- gsub("fat free", "fat_free", x)
    x <- gsub("cream of coconut", "coconut cream", x)
    x <- gsub("^(.*?)®", "", x)               # drop brand-name prefixes ending in the registered mark
    x <- gsub("[^a-z_ ]", "", x)              # keep only letters, underscores and spaces
    x <- gsub("^ ", "", x)
    return(x)
  })
  return(data)
}
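# Usage sketch (toy recipe, not from the data set):
# toy <- data.frame(id = 1)
# toy$ingredients <- list(c("Extra-Virgin Olive Oil", "2% Reduced Fat Milk"))
# preprocessing(toy)$ingredients
# # expected (roughly): "extra_virgin olive_oil" and "reduced_fat milk"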