# Gist by @stormxuwz, created September 22, 2016.
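# ---- classifier.R (file name inferred from the source() calls in the driver scripts below) ----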
# Random forest wrapper: fits 500 trees and returns predicted labels for matTest.
randomForest_clf <- function(matTrain, trainLabel, matTest, ...){
  library(randomForest)
  rf_model <- randomForest(matTrain, trainLabel, ntree = 500)
  rf_predict <- predict(rf_model, matTest)
  return(rf_predict)
}
# xgboost wrapper: 1000 boosting rounds with shallow trees.
xgboost_clf_1000 <- function(matTrain, trainLabel, matTest, ...){
  library(xgboost)
  # Map each cuisine label to a 0-based class index, as required by multi:softmax.
  uniqueLabel <- unique(trainLabel)
  nclass <- length(uniqueLabel)
  indexIngredientMapping <- seq_len(nclass) - 1
  names(indexIngredientMapping) <- uniqueLabel
  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = indexIngredientMapping[as.character(trainLabel)])
  xgb_model <- xgboost(dtrain, max.depth = 6, eta = 0.2, nround = 1000,
                       objective = "multi:softmax", num_class = nclass)
  # multi:softmax predicts 0-based class indices; map them back to label names.
  xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest))
  xgboost_predict <- names(indexIngredientMapping[xgboost_predict + 1])
  return(xgboost_predict)
}
# xgboost wrapper: deep trees, only 5 boosting rounds.
xgboost_clf_200 <- function(matTrain, trainLabel, matTest, ...){
  library(xgboost)
  # Map each cuisine label to a 0-based class index, as required by multi:softmax.
  uniqueLabel <- unique(trainLabel)
  nclass <- length(uniqueLabel)
  indexIngredientMapping <- seq_len(nclass) - 1
  names(indexIngredientMapping) <- uniqueLabel
  dtrain <- xgb.DMatrix(as.matrix(matTrain), label = indexIngredientMapping[as.character(trainLabel)])
  xgb_model <- xgboost(dtrain, max.depth = 25, eta = 0.3, nround = 5,
                       objective = "multi:softmax", num_class = nclass)
  xgboost_predict <- predict(xgb_model, newdata = as.matrix(matTest))
  xgboost_predict <- names(indexIngredientMapping[xgboost_predict + 1])
  return(xgboost_predict)
}
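# Sketch of the label/index round trip used in the xgboost wrappers above
# (the toy labels "a"/"b"/"c" are hypothetical, not from the data set):
# m <- setNames(0:2, c("a", "b", "c"))   # label -> 0-based class index
# m[c("b", "a")]                         # 1 0, the numeric labels fed to xgboost
# names(m[c(1, 0) + 1])                  # "b" "a", predicted indices mapped back to labels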
# k-nearest-neighbour wrapper (k = 5) via caret::knn3Train.
knn_clf <- function(matTrain, trainLabel, matTest, ...){
  library(caret)
  knn_predict <- knn3Train(matTrain, matTest, trainLabel, k = 5, prob = FALSE)
  return(knn_predict)
}
# Feed-forward neural network wrapper: one-hot encodes the labels, fits a
# 5-3 hidden-layer network, and returns the class with the largest output.
neuralnet_clf <- function(matTrain, trainLabel, matTest, ...){
  library(neuralnet)
  # One-hot encode the labels; model.matrix names the columns "trainLabel<class>".
  newY <- as.data.frame(model.matrix(~ -1 + trainLabel))
  matTrain <- cbind(matTrain, newY)
  n <- names(matTrain)
  f <- as.formula(paste(paste(names(newY), collapse = "+"), "~",
                        paste(n[!n %in% names(newY)], collapse = " + ")))
  nn <- neuralnet(f, data = matTrain, hidden = c(5, 3), linear.output = TRUE)
  nn_predict <- compute(nn, matTest)$net.result
  # Strip the "trainLabel" prefix (10 characters) to recover the class names.
  colnames(nn_predict) <- substring(names(newY), 11)
  final <- colnames(nn_predict)[apply(nn_predict, 1, which.max)]
  return(final)
}
# Runs each named classifier on the same train/test split and collects the
# predictions into one data frame, one column per method.
clf_controller <- function(matTrain, trainLabel, matTest, methods = c("knn_clf", "neuralnet_clf"), ...){
  finalResult <- data.frame(id = 1:nrow(matTest))
  for(clf in methods){
    print(paste("doing", clf))
    prediction <- do.call(clf, list(matTrain, trainLabel, matTest))
    finalResult[clf] <- prediction
    # write.table(finalResult,file="tmp_results.csv",sep=",",row.names = FALSE)
  }
  return(finalResult)
}
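# Minimal usage sketch (not part of the original pipeline): iris stands in for
# the ingredient feature matrix; assumes caret and randomForest are installed.
# idx <- sample(nrow(iris), 100)
# demo <- clf_controller(as.matrix(iris[idx, 1:4]), iris$Species[idx],
#                        as.matrix(iris[-idx, 1:4]),
#                        methods = c("knn_clf", "randomForest_clf"))
# head(demo)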
# SVM wrapper with an RBF (radial) kernel.
svm_clf_radial <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- svm(matTrain, trainLabel, kernel = "radial")
  res <- predict(model, newdata = matTest)
  return(res)
}
# SVM wrapper with a linear kernel.
svm_linear_clf <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- svm(matTrain, trainLabel, kernel = "linear")
  res <- predict(model, newdata = matTest)
  return(res)
}
# Naive Bayes wrapper via e1071::naiveBayes.
naiveBayes_clf <- function(matTrain, trainLabel, matTest, ...){
  library(e1071)
  model <- naiveBayes(matTrain, trainLabel)
  res <- predict(model, newdata = matTest)
  return(res)
}
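# ---- Cross-validation driver script ----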
rm(list=ls())
library(jsonlite)
library(caret)
setwd("~/Developer/cooking")
source("./code/preprocessing.R")
source("./code/classifier.R")
source("./code/featureEngineering.R")
train_raw <- fromJSON("./data/train.json", flatten = TRUE)
test_raw <- fromJSON("./data/test.json", flatten = TRUE)
trainLabel <- as.factor(train_raw$cuisine)
# Creates 5 stratified train/validation splits (each keeping a fraction p for
# training) and saves the training indices so every experiment reuses the same splits.
createSplit <- function(p = 0.8){
  splitList <- list()
  for(i in 1:5){
    train.index <- createDataPartition(trainLabel, p = p, list = FALSE)
    splitList[[i]] <- train.index
  }
  saveRDS(splitList, "./data/cv_trainIndex.rds")
}
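# Usage sketch (assumes ./data exists and trainLabel is defined above):
# createSplit(0.8)
# str(readRDS("./data/cv_trainIndex.rds"), max.level = 1)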
# Runs five repeated random train/validation splits for one feature type
# ("word"/"phrase") and one feature-selection size ("small"/"large"), returning
# a 5 x n_methods accuracy matrix (rows = splits, columns = classifiers).
cv_crossValidation <- function(type, featureNum){
  tmp <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
  trainLabel <- tmp$trainLabel
  feature <- tmp$feature
  rm(tmp)
  matTrain <- feature$trainFeature
  rm(feature)
  splitList <- readRDS("./data/cv_trainIndex.rds")
  featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))
  # methodList <- c("xgboost_clf_200","xgboost_clf_1000","naiveBayes_clf","svm_linear_clf","svm_clf_radial")
  methodList <- c("xgboost_clf_200")
  scores <- matrix(0, 5, length(methodList))
  colnames(scores) <- methodList
  for(i in 1:5){
    trainIndex <- splitList[[i]]
    subTrain <- matTrain[trainIndex, featureIndex]
    subTest <- matTrain[-trainIndex, featureIndex]
    subTrainLabel <- trainLabel[trainIndex]
    subTestLabel <- trainLabel[-trainIndex]
    oneFoldResults <- clf_controller(subTrain, subTrainLabel, subTest, methods = methodList)
    for(j in 1:length(methodList)){
      # Classification accuracy on the held-out part of this split.
      scores[i, j] <- sum(oneFoldResults[, methodList[j]] == subTestLabel) / length(subTestLabel)
    }
  }
  return(scores)
}
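# Usage sketch (assumes the dataSet_final_*.rds and feature-index files created
# by the main script already exist); column means give per-model CV accuracy:
# scores <- cv_crossValidation("word", "small")
# colMeans(scores)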
# Runs the cross validation for every combination of feature type and
# feature-selection size and saves the accuracy matrices.
cv_main <- function(){
  word_score_smallFeature <- cv_crossValidation("word", "small")
  word_score_largeFeature <- cv_crossValidation("word", "large")
  phrase_score_smallFeature <- cv_crossValidation("phrase", "small")
  phrase_score_largeFeature <- cv_crossValidation("phrase", "large")
  saveRDS(list(word_score_smallFeature, word_score_largeFeature,
               phrase_score_smallFeature, phrase_score_largeFeature), "cvResults.rds")
  write.table(word_score_largeFeature, "word_score_largeFeature.csv")
  write.table(word_score_smallFeature, "word_score_smallFeature.csv")
  write.table(phrase_score_largeFeature, "phrase_score_largeFeature.csv")
  write.table(phrase_score_smallFeature, "phrase_score_smallFeature.csv")
}
# run
cv_main()
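# ---- featureEngineering.R (file name inferred from the source() calls in the driver scripts) ----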
library(tm)
# Word-level features: splits every ingredient phrase into words via a
# document-term matrix, binarises it, stems the word columns, and drops a few
# function words.
createWordFeature <- function(trainData, testData){
  c_ingredients <- c(Corpus(VectorSource(trainData$ingredients)),
                     Corpus(VectorSource(testData$ingredients)))
  c_ingredientsDTM <- DocumentTermMatrix(c_ingredients)
  c_ingredientsDTM <- as.matrix(c_ingredientsDTM)
  # Keep only presence/absence, not counts.
  c_ingredientsDTM <- ifelse(c_ingredientsDTM > 0, 1, 0)
  c_ingredientsDTM <- as.data.frame(c_ingredientsDTM)
  # c_ingredientsDTM$ingredients_count <- rowSums(c_ingredientsDTM)
  trainFeature <- c_ingredientsDTM[1:nrow(trainData), ]
  testFeature <- c_ingredientsDTM[-(1:nrow(trainData)), ]
  trainFeature <- featureStemming(trainFeature)
  testFeature <- featureStemming(testFeature)
  # Remove function words that carry no cuisine information.
  funcIndex <- names(trainFeature) %in% c("for", "in", "all", "with")
  trainFeature <- trainFeature[, !funcIndex]
  testFeature <- testFeature[, !funcIndex]
  # This part is important
  return(list(trainFeature = trainFeature, testFeature = testFeature))
}
# Phrase-level features: treats every distinct ingredient string as its own
# binary indicator column (no tokenisation).
createPhraseFeature <- function(trainData, testData){
  uniqueLabel <- unique(c(unlist(trainData$ingredients), unlist(testData$ingredients)))
  trainFeature <- matrix(0, nrow(trainData), length(uniqueLabel))
  testFeature <- matrix(0, nrow(testData), length(uniqueLabel))
  colnames(trainFeature) <- uniqueLabel
  colnames(testFeature) <- uniqueLabel
  for(i in 1:nrow(trainData)){
    trainFeature[i, trainData$ingredients[[i]]] <- 1
  }
  for(i in 1:nrow(testData)){
    testFeature[i, testData$ingredients[[i]]] <- 1
  }
  # colFeatureCol <- colSums(trainFeature)
  return(list(trainFeature = as.data.frame(trainFeature), testFeature = as.data.frame(testFeature)))
}
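# Sketch of the phrase encoding on toy data (hypothetical recipes, not from the data set):
# tr <- data.frame(id = 1:2); tr$ingredients <- list(c("salt", "basil"), "salt")
# te <- data.frame(id = 3);   te$ingredients <- list("basil")
# createPhraseFeature(tr, te)$trainFeature   # rows = recipes, one 0/1 column per ingredient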
# Collapses word columns that share the same stem (SnowballC/Porter) into a
# single binary column via an element-wise OR.
featureStemming <- function(data){
  library(SnowballC)
  n <- nrow(data)
  stemmedFeature <- wordStem(names(data))
  uniStemmed <- unique(stemmedFeature)
  newData <- matrix(0, nrow = n, ncol = length(uniStemmed))
  colnames(newData) <- uniStemmed
  newData <- as.data.frame(newData)
  for(i in 1:length(stemmedFeature)){
    # OR the original column into the column of its stem.
    newData[, stemmedFeature[i]] <- ifelse(newData[, stemmedFeature[i]] + data[, i] > 0, 1, 0)
  }
  return(newData)
}
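# Toy illustration (hypothetical columns): singular/plural word columns that
# share a Porter stem should collapse into one column.
# toy <- data.frame(tomatoes = c(1, 0), tomato = c(0, 1), basil = c(1, 1))
# featureStemming(toy)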
createFeature <- function(trainData, testData, type = "word"){
  if(type == "word"){
    featureList <- createWordFeature(trainData, testData)
  }else if(type == "phrase"){
    featureList <- createPhraseFeature(trainData, testData)
  }
  trainFeature <- featureList$trainFeature
  testFeature <- featureList$testFeature
  # Do feature selection
  # print("Do Feature Selection")
  # featureIndex <- featureSelection(trainFeature,trainData$cuisine,thresh=2,type="mi")
  # trainFeature <- trainFeature[,featureIndex]
  # testFeature <- testFeature[,featureIndex]
  # # Do feature engineering
  # print("Do Feature Engineering")
  # trainFeature <- cbind(trainFeature,featureEngineering(trainFeature))
  # testFeature <- cbind(testFeature,featureEngineering(testFeature))
  return(list(trainFeature = trainFeature, testFeature = testFeature))
}
featureEngineering <- function(trainFeature){
  # adding ingredient count
  numOfIngredients <- rowSums(trainFeature)
  # adding color information
  # adding the ratio of meat
  additionalFeature <- data.frame(numOfIngredients = numOfIngredients)
  return(additionalFeature)
}
# Selects feature columns by entropy or mutual information (MI) against the
# cuisine labels; in either case a feature must also appear in more than 10 recipes.
featureSelection <- function(trainFeature, trainLabel, thresh, type = "mi"){
  library(entropy)
  # trainFeature is a binary data frame, one column per ingredient/word.
  n <- ncol(trainFeature)
  featureIndex <- 1:ncol(trainFeature)
  featureColSum <- colSums(trainFeature)
  frequencyFeature <- which(featureColSum > 10)
  if(type == "entropy"){
    # Keep features whose label distribution (among recipes containing the
    # feature) has low entropy, i.e. the feature is concentrated in few cuisines.
    entropy <- rep(0, n)
    for(i in 1:n){
      index <- which(trainFeature[, i] == 1)
      entropy[i] <- entropy.empirical(table(trainLabel[index]), unit = "log")
    }
    featureIndex <- which(entropy < thresh)
  }
  if(type == "mi"){
    # Keep features with high mutual information between the feature indicator
    # and the cuisine label.
    mi <- rep(0, n)
    for(i in 1:n){
      mi[i] <- mi.empirical(table(trainLabel, trainFeature[, i]), unit = "log")
    }
    featureIndex <- which(mi > thresh)
  }
  featureIndex <- intersect(featureIndex, frequencyFeature)
  return(featureIndex)
}
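# Sketch of the MI criterion on toy data (hypothetical, not from the data set):
# library(entropy)
# lab  <- factor(c("a", "a", "b", "b"))
# feat <- c(1, 1, 0, 0)                        # perfectly informative binary feature
# mi.empirical(table(lab, feat), unit = "log") # log(2) ~ 0.69 nats
# A column is kept when this value exceeds `thresh` and its column sum is > 10.

# ---- Main driver script ----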
rm(list=ls())
library(jsonlite)
setwd("~/Developer/cooking")
source("./code/preprocessing.R")
source("./code/classifier.R")
source("./code/featureEngineering.R")
# Main entry point. test: run the classifiers on a small cached subset;
# createNew: rebuild features from the raw JSON; type: "word" or "phrase";
# featureNum: which saved feature index to use ("small" or "large").
main <- function(test = FALSE, createNew = FALSE, type = "word", featureNum = "small"){
  print("reading raw data")
  train_raw <- fromJSON("./data/train.json", flatten = TRUE)
  test_raw <- fromJSON("./data/test.json", flatten = TRUE)
  if(test){
    tmp <- readRDS("test.RDS")
    matTrain <- tmp[[1]]
    matTest <- tmp[[2]]
    trainLabel <- as.factor(as.character(tmp[[3]]))
    finalResult <- clf_controller(matTrain, trainLabel, matTest,
                                  methods = c("knn_clf", "neuralnet_clf", "xgboost_clf_200", "randomForest_clf"))
    print(finalResult)
    write.table(finalResult, file = "test.csv", sep = ",", row.names = FALSE)
  }
  else{
    if(createNew){
      print("preprocessing data")
      train_raw <- preprocessing(train_raw)
      test_raw <- preprocessing(test_raw)
      saveRDS(list(train_raw, test_raw), "preprocessed.rds")
      trainLabel <- as.factor(train_raw$cuisine)
      testid <- test_raw$id
      print("creating features")
      feature <- createFeature(train_raw, test_raw, type)
      print("saving files")
      saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_", type, ".rds", sep = ""))
      tmp <- readRDS(paste("dataSet_", type, ".rds", sep = ""))
      feature <- tmp$feature
      matTrain <- feature$trainFeature
      matTest <- feature$testFeature
      Train_numOfIngredients <- unlist(lapply(train_raw$ingredients, length))
      Test_numOfIngredients <- unlist(lapply(test_raw$ingredients, length))
      # "small" feature set: mutual-information feature selection.
      if(type == "word"){
        featureIndex <- featureSelection(matTrain, trainLabel, thresh = exp(-7.5), type = "mi")
      }else{
        featureIndex <- featureSelection(matTrain, trainLabel, thresh = exp(-7.8), type = "mi")
      }
      cat("select", length(featureIndex), "from original feature num", ncol(matTrain))
      saveRDS(featureIndex, paste("./data/", type, "_featureIndex_small.rds", sep = ""))
      # "large" feature set: keep every column; +1 accounts for the numOfIngredient column appended below.
      saveRDS(c(1:(ncol(matTrain) + 1)), paste("./data/", type, "_featureIndex_large.rds", sep = ""))
      # feature engineering: add the ingredient count as an extra feature
      feature$trainFeature$numOfIngredient <- Train_numOfIngredients
      feature$testFeature$numOfIngredient <- Test_numOfIngredients
      saveRDS(list(feature = feature, trainLabel = trainLabel, testid = testid), paste("dataSet_final_", type, ".rds", sep = ""))
    }else{
      print("reading the saved data")
      tmp <- readRDS(paste("dataSet_final_", type, ".rds", sep = ""))
      trainLabel <- tmp$trainLabel
      testid <- tmp$testid
      feature <- tmp$feature
      rm(tmp)
      matTrain <- feature$trainFeature
      matTest <- feature$testFeature
      rm(feature)
      featureIndex <- readRDS(paste("./data/", type, "_featureIndex_", featureNum, ".rds", sep = ""))
      matTrain <- matTrain[, featureIndex]
      matTest <- matTest[, featureIndex]
      methodList <- c("xgboost_clf_200", "xgboost_clf_1000", "naiveBayes_clf", "svm_linear_clf", "svm_clf_radial")
      finalResult <- clf_controller(matTrain, trainLabel, matTest, methods = methodList)
      finalResult$id <- testid
      write.table(finalResult, file = "final.csv", sep = ",", row.names = FALSE)
    }
  }
  # if(test){
  #   matTrain <- matTrain[1:100,1:100]
  #   matTest <- matTest[1:100,1:100]
  #   trainLabel <- trainLabel[1:100]
  #   saveRDS(list(matTrain,matTest,trainLabel),"test.RDS")
  # }
}
# main(test=FALSE,createNew=TRUE,"word")
main(test=FALSE,createNew=TRUE,"phrase")
# main(test=FALSE,createNew=FALSE,"word")
# Train_colorNum <- rowSums(matTrain[,c("red","yellow","black","green","white","blue","purpl")])
# Train_numOfWords <- rowSums(matTrain)
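# ---- preprocessing.R (file name inferred from the source() calls in the driver scripts) ----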
# Normalises the raw ingredient strings: lower-cases them, glues common
# multi-word modifiers together with underscores so they survive word
# tokenisation, and strips brand marks and non-alphabetic characters.
preprocessing <- function(data){
  data$ingredients <- lapply(data$ingredients, FUN = tolower)
  data$ingredients <- lapply(data$ingredients, FUN = function(x){
    x <- gsub("-", " ", x)
    x <- gsub("_", " ", x)
    x <- gsub("not low fat", "fat", x)        # must run before the "low fat" rule below
    x <- gsub("low fat", "low_fat", x)
    x <- gsub("all purpose", "all_purpose", x)
    x <- gsub("reduced fat", "reduced_fat", x)
    # x <- gsub("black pepper","black_pepper",x)
    x <- gsub("olive oil", "olive_oil", x)
    # x <- gsub("country style","country_style",x)
    x <- gsub("no calorie", "no_calorie", x)
    x <- gsub("no salt added", "no_salt_added", x)
    x <- gsub("greekstyl", "greek style", x)
    x <- gsub("\\bdress\\b", "dressing", x)   # word boundary so "dressing" is left alone
    x <- gsub("kraft", "", x)
    x <- gsub("color", "colour", x)
    # x <- gsub("ice cream","ice_cream",x)
    x <- gsub(" ic ", " ice ", x)
    x <- gsub("lowfat", "low_fat", x)
    x <- gsub("knorr homestyl stock", "", x)
    x <- gsub("half & half", "half_half", x)
    x <- gsub("gluten free", "gluten_free", x)
    x <- gsub("long grain", "long_grain", x)
    x <- gsub("extra virgin", "extra_virgin", x)
    x <- gsub("low sodium", "low_sodium", x)
    x <- gsub("canola oil", "canola_oil", x)
    x <- gsub("five spice", "five_spice", x)
    x <- gsub("free range", "free_range", x)
    x <- gsub("fat free", "fat_free", x)
    x <- gsub("cream of coconut", "coconut cream", x)
    x <- gsub("^(.*?)®", "", x)               # drop brand-name prefixes ending in the registered mark
    x <- gsub("[^a-z_ ]", "", x)              # keep only letters, underscores and spaces
    x <- gsub("^ ", "", x)
    return(x)
  })
  return(data)
}
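# Usage sketch (toy recipe, not from the data set):
# toy <- data.frame(id = 1)
# toy$ingredients <- list(c("Extra-Virgin Olive Oil", "2% Reduced Fat Milk"))
# preprocessing(toy)$ingredients
# # expected (roughly): "extra_virgin olive_oil" and "reduced_fat milk"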