Skip to content

Instantly share code, notes, and snippets.

@alfard
Last active December 28, 2015 17:29
Show Gist options
  • Save alfard/7536685 to your computer and use it in GitHub Desktop.
Save alfard/7536685 to your computer and use it in GitHub Desktop.
Code Expedia Kaggle
# Load the training sample and derive the graded relevance label `reli`:
#   5 = clicked and booked, 1 = clicked but not booked, 0 = anything else.
sample_train <- read.csv("/home/alfard/trainnrandom.csv", na.strings = "NA")
# NOTE(review): X is presumably the row-index column added when the sample
# was written with write.csv -- confirm.
sample_train$X <- NULL
sample_train$s1 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 0, 1, 0
)
sample_train$s2 <- ifelse(
  sample_train$click_bool == 1 & sample_train$booking_bool == 1, 5, 0
)
sample_train$reli <- sample_train$s1 + sample_train$s2
# Exploratory steps kept for reference (label frequencies, SMOTE rebalancing):
#sasfreq(reli,sample_train)
#sample_train$reli <- factor(sample_train$reli)
#sampleSMOTE <- SMOTE(reli ~ ., sample_train, perc.over = 10000,perc.under=100)
#sasfreq(reli,sampleSMOTE)

# Shape the training frame.
#loop <- read.csv("~/Documents/loop.csv",na.strings="NULL")
# Drop, by name, the columns the label was built from (click_bool,
# booking_bool, s1, s2) together with random_bool, position and
# gross_bookings_usd, which are excluded from modelling.
#letor('booking_bool',ST2)
#ST$prop_id <- factor(ST$prop_id)
ST <- sample_train[
  ,
  !(names(sample_train) %in% c(
    "click_bool", "booking_bool", "s1", "s2",
    "random_bool", "position", "gross_bookings_usd"
  )),
  drop = FALSE
]
# Within each search, list the most relevant rows first.
ST2 <- ST[order(ST$srch_id, -ST$reli), ]
# Move the label to the first column. Selecting by name instead of the
# former hard-coded c(50, 1:49) keeps every column even if the CSV layout
# changes.
ST2 <- ST2[c("reli", setdiff(names(ST2), "reli"))]
library(gtools)
library(dummies)
library(som)
# sasfreq(colonne, tablo): frequency table of column `colonne` of data
# frame `tablo`, with missing values counted as their own level
# (exclude = NULL) and a `perc` column holding each level's share in
# percent, rounded to 2 digits.
# Being a gtools macro, the body is expanded in the caller's frame, so the
# result is left in a variable named `temp` there (overwriting any previous
# `temp`); nothing is printed.
sasfreq <- defmacro(colonne, tablo, expr = {
  temp <- as.data.frame(table(tablo$colonne, exclude = NULL))
  temp$perc <- round(prop.table(temp$Freq) * 100, digits = 2)
  # temp <- temp[order(perc, )]
})
# Clear ST and sample_train to reclaim memory; ST2 is the working frame
# from here on.
sample_train <- NULL
ST <- NULL

###########################################################
# date_time: keep characters 6-7 as a number, then standardise it.
# NOTE(review): presumably the month of a "YYYY-MM-DD ..." timestamp --
# confirm the date format of the input file.
# Alternative kept for reference: treat it as a factor / dummy-encode it.
#ST2$date_time <- factor(ST2$date_time)
#t<-as.data.frame(dummy(ST2$date_time))
ST2$date_time <- normalize(
  as.numeric(substr(ST2$date_time, 6, 7)),
  byrow = TRUE
)

# Categorical columns are converted to factors:
#   reli                         relevance label
#   site_id                      point of sale
#   visitor_location_country_id  country the search originates from
#sasfreq(site_id,ST2)
#t1<-as.data.frame(dummy(ST2$site_id))
#sasfreq(visitor_location_country_id,ST2)
#t2<-as.data.frame(dummy(ST2$visitor_location_country_id))
for (id_col in c("reli", "site_id", "visitor_location_country_id")) {
  ST2[[id_col]] <- factor(ST2[[id_col]])
}
rm(id_col)
######## Visitor history: type of hotel booked previously and its price.
# For each history column, first record a 0/1 missing-value flag in
# <column>NA, then standardise the column itself.
#sasfreq(visitor_hist_starrating,ST2)
#t3<-as.data.frame(dummy(ST2$visitor_hist_starrating))
#sasfreq(visitor_hist_adr_usd,ST2)
for (hist_col in c("visitor_hist_starrating", "visitor_hist_adr_usd")) {
  ST2[[paste0(hist_col, "NA")]] <- ifelse(is.na(ST2[[hist_col]]), 1, 0)
  ST2[[hist_col]] <- normalize(ST2[[hist_col]], byrow = TRUE)
}
rm(hist_col)

# Country of the property, as a factor.
#sasfreq(prop_country_id,ST2)
ST2$prop_country_id <- factor(ST2$prop_country_id)

# Frequency table of hotel ids (left in `temp` by the macro).
sasfreq(prop_id, ST2)
# No point in using the hotel id as a feature: 41657
# Keep propid for later:
#propid <-ST2["prop_id"]

# Number of stars, as a factor.
#sasfreq(prop_starrating,ST2)
ST2$prop_starrating <- factor(ST2$prop_starrating)

# Hotel review score: flag missing values first, then convert to a factor.
#sasfreq(prop_review_score,ST2)
ST2$prop_review_scoreNA <- ifelse(is.na(ST2$prop_review_score), 1, 0)
ST2$prop_review_score <- factor(ST2$prop_review_score)
# Numeric property and search features are standardised in place with
# normalize(); the remaining columns of this section are handled as noted.
#
# Left untouched:
#   prop_brand_bool           dummy: hotel belongs to a chain
#   promotion_flag            promotion flag
#   srch_saturdy_night_bool   dummy
# Already removed before this point:
#   position            position in Expedia's ranking, absent from test
#   gross_bookings_usd  transaction amount, absent from test
#   random_bool         removed for modelling
# (Original note next to price_usd: "figure too low".)
#
# Exploratory calls kept for reference:
#sasfreq(prop_brand_bool,ST2)
#sasfreq(prop_location_score1,ST2)
#sasfreq(prop_location_score2,ST2)
#sasfreq(prop_log_historical_price,ST2)
#letor('position',ST2)
#sasfreq(price_usd,ST2)
#sasfreq(promotion_flag,ST2)
#letor('gross_bookings_usd',ST2)
#sasfreq(srch_destination_id,ST2)
#sasfreq(srch_length_of_stay,ST2)
#sasfreq(srch_booking_window,ST2)
#sasfreq(srch_adults_count,ST2)
#sasfreq(srch_children_count,ST2)
#sasfreq(srch_room_count,ST2)
#sasfreq(srch_saturdy_night_bool,ST2)
#sasfreq(srch_query_affinity_score,ST2)
#sasfreq(orig_destination_distance,ST2)
#sasfreq(random_bool,ST2)
#letor('random_bool',ST2)
for (num_col in c(
  "prop_location_score1",
  "prop_location_score2",
  "prop_log_historical_price",   # log of the hotel's mean price
  "price_usd",                   # price in dollars
  "srch_length_of_stay",
  "srch_booking_window",
  "srch_adults_count",
  "srch_children_count",
  "srch_room_count",             # number of rooms
  "srch_query_affinity_score",   # log probability the hotel is chosen
  "orig_destination_distance"
)) {
  ST2[[num_col]] <- normalize(ST2[[num_col]], byrow = TRUE)
}
rm(num_col)

# Destination id, as a factor.
ST2$srch_destination_id <- factor(ST2$srch_destination_id)
#########################################################################################
# Competitor features (comp1 .. comp8).
#
# add_competitor_features(df, competitors): for every competitor k in
# `competitors`, adds three 0/1 missing-value flags
#   comp<k>_rateNA, comp<k>_invNA, comp<k>_rate_percent_diffNA
# (in that order), then standardises comp<k>_rate_percent_diff in place
# with normalize(). The flags are computed before the standardisation, so
# they describe the raw column. Returns the modified data frame.
# This replaces eight copy-pasted stanzas; the resulting columns and their
# order are the same.
add_competitor_features <- function(df, competitors = 1:8) {
  for (k in competitors) {
    prefix <- paste0("comp", k, "_")
    for (field in c("rate", "inv", "rate_percent_diff")) {
      source_col <- paste0(prefix, field)
      df[[paste0(source_col, "NA")]] <- ifelse(is.na(df[[source_col]]), 1, 0)
    }
    diff_col <- paste0(prefix, "rate_percent_diff")
    df[[diff_col]] <- normalize(df[[diff_col]], byrow = TRUE)
  }
  df
}
ST2 <- add_competitor_features(ST2)
# V4 = 1-based position of each row inside its srch_id group, in the
# current row order of ST2.
# seq_along() is used instead of seq(): for a group with a single row,
# seq(x) would expand the srch_id value itself into 1:x, which does not
# match the group length and triggers replacement-length warnings.
ST2$V4 <- with(ST2, ave(srch_id, srch_id, FUN = seq_along))
# Keep at most the first 5 rows of every search.
ST3 <- ST2[!(ST2$V4 > 5), ]
# Collapse the label to binary: 1 when reli is 5, 0 otherwise. reli is a
# factor at this point, so the comparison is against its level label.
ST3$reli <- ifelse(ST3$reli == 5, 1, 0)
# Frequency table of the binary label (left in `temp` by the macro).
sasfreq(reli, ST3)
#load("train.RData")
#save.image("~/1238.RData")
#load("1238.RData")
# Reset the result objects of any previous run before refitting.
# ST2 is deliberately NOT cleared here any more: the scoring step further
# down still passes it to predict.gbm(), sasfreq() and cbind(), and
# setting it to NULL at this point left that step without any data.
SOL <- NULL
SOL2 <- NULL
predictionsgbm <- NULL
predictionsgbmtest <- NULL
sample_test <- NULL
tt <- NULL
#train.fraction=0.7
# Re-apply the 5-rows-per-search filter. It is a no-op when ST3 was built
# just above; it only matters if ST3 came from a saved workspace.
ST3 <- ST3[!(ST3$V4 > 5), ]
sasfreq(reli, ST3)
library(gbm)
# Fit a Bernoulli GBM on the binary label using every column of ST3 except
# the identifiers srch_destination_id, prop_id, srch_id and the helper
# rank V4.
# NOTE(review): with train.fraction = 0.5 gbm is expected to hold out part
# of ST3 for the "test" error curve used below -- confirm against the gbm
# documentation.
tt <- gbm(
  reli ~ . - srch_destination_id - V4 - prop_id - srch_id,
  data = ST3,
  distribution = "bernoulli",
  n.trees = 2000,
  shrinkage = 0.01,
  train.fraction = 0.5,
  cv.folds = 0,
  verbose = TRUE
)
# Number of trees selected by gbm.perf() with the "test" method.
best.iter <- gbm.perf(tt, method = "test")

###########################################
# Prototype: score the training frame with the selected number of trees.
# NOTE(review): this needs ST2 to still hold the preprocessed training
# frame at this point -- confirm it has not been cleared beforehand.
# The predict.gbm() call is kept positional and unchanged because
# data.frame() derives the column name from the call text.
predictionsgbm <- data.frame(predict.gbm(tt, ST2, best.iter))
sasfreq(reli, ST2)
# Predictions side by side with the label and the search id.
SS <- cbind(predictionsgbm, ST2$reli, ST2$srch_id)
#################################################
# Score the test frame and write the ranked submission file.
# NOTE(review): T3 is not created anywhere in this script; presumably it
# is the preprocessed test set restored from a saved workspace -- confirm.
predictionsgbmtest <- data.frame(predict.gbm(tt, T3, best.iter))
# Take the prediction column by position. Its auto-generated name depends
# on the exact text of the predict.gbm() call, so a lookup by name would
# silently return NULL as soon as that call is edited.
SOL <- cbind(predictionsgbmtest[[1]], T3$srch_id, T3$prop_id)
# Fail loudly if any of the three inputs was missing (cbind drops NULLs).
stopifnot(ncol(SOL) == 3L)
# Columns become V1 = predicted score, V2 = srch_id, V3 = prop_id.
SOL <- as.data.frame(SOL)
# Per search, highest predicted score first.
SOL2 <- SOL[order(SOL$V2, -SOL$V1), ]
# The score only serves for ordering; the file keeps V2 and V3.
SOL2$V1 <- NULL
write.csv(SOL2, file = "MyData61000.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment