# Last active
# December 28, 2015 17:29
# -
# -
# Save alfard/7536685 to your computer and use it in GitHub Desktop.
# Code Expedia Kaggle
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Load the training sample; the string "NA" marks missing values.
sample_train <- read.csv("/home/alfard/trainnrandom.csv", na.strings = "NA")
# Drop the X column. NOTE(review): presumably the row-name column left behind
# by an earlier write.csv() -- confirm against the file.
sample_train$X <- NULL

# Graded relevance label:
#   1 = clicked but not booked (s1), 5 = clicked and booked (s2), 0 otherwise.
# s1 and s2 are mutually exclusive, so their sum is the label itself.
sample_train$s1 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 0, 1, 0
)
sample_train$s2 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 1, 5, 0
)
sample_train$reli <- sample_train$s1 + sample_train$s2
# sasfreq(reli, sample_train)
# sample_train$reli <- factor(sample_train$reli)
# sampleSMOTE <- SMOTE(reli ~ ., sample_train, perc.over = 10000, perc.under = 100)
# sasfreq(reli, sampleSMOTE)

# Format the training set
# loop <- read.csv("~/Documents/loop.csv", na.strings = "NULL")
ST <- sample_train

# Remove the raw outcome columns and the two helper columns now that `reli`
# carries the target.
ST$click_bool <- NULL
# letor('booking_bool', ST2)
ST$booking_bool <- NULL
ST$s1 <- NULL
ST$s2 <- NULL
# ST$prop_id <- factor(ST$prop_id)
# random_bool is deliberately left out of the model; position and
# gross_bookings_usd are absent from the test set (see the notes further down
# in this script).
ST$random_bool <- NULL
ST$position <- NULL
ST$gross_bookings_usd <- NULL

# Sort by search, most relevant rows first within each search.
ST2 <- ST[order(ST$srch_id, -ST$reli), ]
# Move `reli` to the first column. The original used the hard-coded positions
# c(50, 1:49); selecting by name gives the same result on the expected
# 50-column frame (where `reli`, added last, is column 50) and keeps working
# if the number of input columns changes.
ST2 <- ST2[c("reli", setdiff(names(ST2), "reli"))]
library(gtools)
library(dummies)
library(som)

# sasfreq(colonne, tablo): frequency table of column `colonne` of data frame
# `tablo`.
#
# This is a gtools macro, not a function: `colonne` is substituted textually
# (pass the bare column name, e.g. sasfreq(reli, ST3)) and the body is
# evaluated in the caller's frame. Consequences:
#   * the result is left in a variable named `temp` in the caller's
#     environment, overwriting any existing `temp`;
#   * nothing is printed -- inspect `temp` after the call.
#
# `temp` has the columns produced by as.data.frame(table(...)) (Var1, Freq)
# plus `perc`, the share of each value in percent rounded to 2 decimals.
# exclude = NULL makes table() count NA as a value of its own.
sasfreq <- defmacro(colonne, tablo, expr = {
  temp <- as.data.frame(table(tablo$colonne, exclude = NULL))
  temp$perc <- round((temp$Freq / sum(temp$Freq)) * 100, digits = 2)
  # temp <- temp[order(perc, )]
})
# Clear ST and sample_train to reclaim memory (the names stay bound to NULL).
ST <- NULL
sample_train <- NULL

# Search- and property-level feature preparation on ST2 ----
# Conventions used below:
#   * identifier-like columns are converted to factors;
#   * numeric columns are passed through normalize() from the som package
#     (NOTE(review): presumably standardises to mean 0 / variance 1 -- confirm
#     what it returns for a plain vector);
#   * for some columns with missing values a 0/1 indicator column
#     `<name>NA` is added BEFORE the column itself is transformed.

# ST2$date_time <- as.numeric(substr(ST2$date_time, 6, 7))
ST2$reli <- factor(ST2$reli)
# Characters 6-7 of date_time, as a number. NOTE(review): this is the month if
# date_time is formatted "YYYY-MM-DD ..." -- confirm.
ST2$date_time <- as.numeric(substr(ST2$date_time, 6, 7))
ST2$date_time <- normalize(ST2$date_time, byrow = TRUE)
# ST2$date_time <- factor(ST2$date_time)
# t <- as.data.frame(dummy(ST2$date_time))

# Point of sale
# sasfreq(site_id, ST2)
ST2$site_id <- factor(ST2$site_id)
# t1 <- as.data.frame(dummy(ST2$site_id))

# Country the search originates from
# sasfreq(visitor_location_country_id, ST2)
# ST2$visitor_location_country_id <- factor(ST2$visitor_location_country_id)
# t2 <- as.data.frame(dummy(ST2$visitor_location_country_id))
ST2$visitor_location_country_id <- factor(ST2$visitor_location_country_id)

# Type of hotel the visitor booked previously
# sasfreq(visitor_hist_starrating, ST2)
ST2$visitor_hist_starratingNA <- ifelse(is.na(ST2$visitor_hist_starrating), 1, 0)
# t3 <- as.data.frame(dummy(ST2$visitor_hist_starrating))
ST2$visitor_hist_starrating <- normalize(ST2$visitor_hist_starrating, byrow = TRUE)

# sasfreq(visitor_hist_adr_usd, ST2)
ST2$visitor_hist_adr_usdNA <- ifelse(is.na(ST2$visitor_hist_adr_usd), 1, 0)
ST2$visitor_hist_adr_usd <- normalize(ST2$visitor_hist_adr_usd, byrow = TRUE)

# sasfreq(prop_country_id, ST2)
ST2$prop_country_id <- factor(ST2$prop_country_id)

# Leaves the prop_id frequency table in `temp` (sasfreq prints nothing).
sasfreq(prop_id, ST2)
# No point in using the hotel id: 41657
# Keep propid for later
# propid <- ST2["prop_id"]

# Star rating
# sasfreq(prop_starrating, ST2)
ST2$prop_starrating <- factor(ST2$prop_starrating)

# Hotel review score
# sasfreq(prop_review_score, ST2)
ST2$prop_review_scoreNA <- ifelse(is.na(ST2$prop_review_score), 1, 0)
ST2$prop_review_score <- factor(ST2$prop_review_score)

# Dummy: belongs to a chain
# sasfreq(prop_brand_bool, ST2)

# sasfreq(prop_location_score1, ST2)
ST2$prop_location_score1 <- normalize(ST2$prop_location_score1, byrow = TRUE)
# sasfreq(prop_location_score2, ST2)
ST2$prop_location_score2 <- normalize(ST2$prop_location_score2, byrow = TRUE)

# Log of the hotel's mean price
# sasfreq(prop_log_historical_price, ST2)
ST2$prop_log_historical_price <- normalize(ST2$prop_log_historical_price, byrow = TRUE)

# letor('position', ST2)
# Position in Expedia's ranking: absent from the test set

# Price in dollars
# sasfreq(price_usd, ST2)
ST2$price_usd <- normalize(ST2$price_usd, byrow = TRUE)
# Figure too low

# Promotion flag
# sasfreq(promotion_flag, ST2)

# letor('gross_bookings_usd', ST2)
# Transaction amount: absent from the test set

# Destination ID
# sasfreq(srch_destination_id, ST2)
ST2$srch_destination_id <- factor(ST2$srch_destination_id)

# sasfreq(srch_length_of_stay, ST2)
ST2$srch_length_of_stay <- normalize(ST2$srch_length_of_stay, byrow = TRUE)
# sasfreq(srch_booking_window, ST2)
ST2$srch_booking_window <- normalize(ST2$srch_booking_window, byrow = TRUE)
# sasfreq(srch_adults_count, ST2)
ST2$srch_adults_count <- normalize(ST2$srch_adults_count, byrow = TRUE)
# sasfreq(srch_children_count, ST2)
ST2$srch_children_count <- normalize(ST2$srch_children_count, byrow = TRUE)

# Number of rooms
# sasfreq(srch_room_count, ST2)
ST2$srch_room_count <- normalize(ST2$srch_room_count, byrow = TRUE)

# Dummy
# sasfreq(srch_saturday_night_bool, ST2)

# Log probability that the hotel is chosen
# sasfreq(srch_query_affinity_score, ST2)
ST2$srch_query_affinity_score <- normalize(ST2$srch_query_affinity_score, byrow = TRUE)

# sasfreq(orig_destination_distance, ST2)
ST2$orig_destination_distance <- normalize(ST2$orig_destination_distance, byrow = TRUE)

# sasfreq(random_bool, ST2)
# letor('random_bool', ST2)
# This variable is removed for modelling
# Competitor features (comp1 .. comp8) ----
# For every competitor i the script
#   1. adds 0/1 missing-value indicators comp<i>_rateNA, comp<i>_invNA and
#      comp<i>_rate_percent_diffNA (in that order, so the resulting column
#      order matches the original one-statement-per-column version), then
#   2. passes comp<i>_rate_percent_diff through normalize() (som package).
# comp<i>_rate and comp<i>_inv themselves are left untouched.
# The original also carried a commented-out letor('<column>', ST2) call in
# front of each indicator.

# Returns `df` with the indicator columns appended and the percent-diff
# columns normalised, for competitors 1..n_comp.
add_competitor_features <- function(df, n_comp = 8L) {
  for (i in seq_len(n_comp)) {
    prefix <- paste0("comp", i, "_")
    for (suffix in c("rate", "inv", "rate_percent_diff")) {
      col <- paste0(prefix, suffix)
      df[[paste0(col, "NA")]] <- ifelse(is.na(df[[col]]), 1, 0)
    }
    pct_col <- paste0(prefix, "rate_percent_diff")
    df[[pct_col]] <- normalize(df[[pct_col]], byrow = TRUE)
  }
  df
}

ST2 <- add_competitor_features(ST2)
# Row position within each search (1, 2, 3, ...), in the current row order of
# ST2. seq_along is used instead of seq: for a search with a single row,
# seq(<srch_id>) expands to 1:<srch_id> instead of 1, which allocates a
# needless vector and triggers a replacement-length warning inside ave().
ST2$V4 <- with(ST2, ave(srch_id, srch_id, FUN = seq_along))

# Training subset: at most the first 5 rows of every search.
# NOTE(review): ST2 is expected to be sorted by srch_id and descending
# relevance at this point, making these the 5 most relevant rows -- confirm.
ST3 <- ST2[ST2$V4 <= 5, ]
# Binary target for the model: 1 = booked (reli == 5), 0 = anything else.
ST3$reli <- ifelse(ST3$reli == 5, 1, 0)
# Leaves the class balance of the target in `temp` (sasfreq prints nothing).
sasfreq(reli, ST3)
# load("train.RData")
# save.image("~/1238.RData")
# load("1238.RData")

# Reset result holders from any earlier run.
# ST2 is NOT reset here (the original did so): it is still needed below for
# the in-sample predictions and would otherwise be NULL when used.
SOL <- NULL
SOL2 <- NULL
predictionsgbm <- NULL
predictionsgbmtest <- NULL
sample_test <- NULL
tt <- NULL

# train.fraction=0.7
library(gbm)
# Bernoulli GBM on every column except the high-cardinality / identifier
# columns and the helper V4. With train.fraction = 0.5 and no CV folds, gbm
# fits on the first half of the rows of ST3 and keeps the second half as the
# test set that gbm.perf(method = "test") uses to pick the iteration count.
tt <- gbm(
  reli ~ . - srch_destination_id - V4 - prop_id - srch_id,
  distribution = "bernoulli", data = ST3,
  n.trees = 2000, shrinkage = 0.01, cv.folds = 0, train.fraction = 0.5,
  verbose = TRUE
)
best.iter <- gbm.perf(tt, method = "test")

# In-sample scores on the full prepared training frame.
predictionsgbm <- data.frame(predict.gbm(tt, ST2, best.iter))
###########################################
sasfreq(reli, ST2)
# Prototype: scores next to the graded label and the search id.
SS <- cbind(predictionsgbm, ST2$reli, ST2$srch_id)
# ST2 is no longer needed from here on; release it to reclaim memory.
ST2 <- NULL

#################################################
# Score the test set.
# NOTE(review): T3 is not created anywhere in this file; presumably it is the
# test set prepared with the same transformations as ST2 -- confirm it is
# loaded before this point.
predictionsgbmtest <- data.frame(predict.gbm(tt, T3, best.iter))
# Take the single prediction column by position instead of by its
# auto-generated name (predict.gbm.tt..T3..best.iter.), which silently depends
# on how the call above is spelled.
SOL <- cbind(predictionsgbmtest[[1]], T3$srch_id, T3$prop_id)
# cbind() of unnamed vectors: columns become V1 (score), V2 (srch_id),
# V3 (prop_id).
SOL <- as.data.frame(SOL)
# Within each search, best score first; then drop the score so only the
# ordered (srch_id, prop_id) pairs are written.
SOL2 <- SOL[order(SOL$V2, -SOL$V1), ]
SOL2$V1 <- NULL
# NOTE(review): the CSV header will be "V2","V3"; check it against the column
# names the submission format requires.
write.csv(SOL2, file = "MyData61000.csv", row.names = FALSE)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment