Skip to content

Instantly share code, notes, and snippets.

@klauszhang
Created May 5, 2016 05:58
Show Gist options
  • Save klauszhang/94cc1773497ebacce926c309d9aa554d to your computer and use it in GitHub Desktop.
Save klauszhang/94cc1773497ebacce926c309d9aa554d to your computer and use it in GitHub Desktop.
Try to mine Expedia booking search data.
# read data
library(data.table)
# Load the raw train/test CSVs (first row is the header) as data.tables,
# then trigger a garbage collection.
train <- fread("train.csv", header = TRUE)
test <- fread("test.csv", header = TRUE)
gc()
# the label: is_booking, hotel_cluster
# Peek at the first rows of each table.
head(train)
head(test)
# process the data
# create id for train data ####
# Prepend a character column `id` holding the row names of train, so that
# individual rows can be identified after later filtering.
train <- data.table(id = rownames(train), train)
# train$site_name<-as.factor(train$site_name)
# remove those checkin date is empty entries####
# Collect every row whose check-in date (srch_ci) is the empty string.
record_without_ci <- train[train$srch_ci == "", ]
head(record_without_ci$id)
head(train$id)
# Sanity check: join the selection back onto train by id (keeping all rows of
# the selection) and list the distinct check-in values on each side of the
# join, to see whether only empty check-in entries were selected. ####
ci_check <- merge(x = train, y = record_without_ci, by = "id", all.y = TRUE)
head(ci_check)
# distinct check-in values coming from train
levels(as.factor(ci_check$srch_ci.x))
# distinct check-in values coming from the selection
levels(as.factor(ci_check$srch_ci.y))
# cleanup
rm(ci_check)
# remove entries without ci/co ####
# Rows whose check-in (srch_ci) or check-out (srch_co) date is the empty
# string are believed to be incorrect, so they are dropped from train.
# The rows are filtered with a single logical mask instead of collecting
# their ids and matching them back with %in%: this keeps exactly the same
# rows but avoids matching millions of character ids and copying the table
# twice. `%in%` never yields NA, so rows with an NA date are kept.
missing_ci <- train$srch_ci %in% ""
missing_co <- train$srch_co %in% ""
train <- train[!(missing_ci | missing_co), ]
rm(missing_ci, missing_co)
# Check that all entries without ci and co have been removed
# (both counts should print 0).
sum(train$srch_ci == "", na.rm = TRUE)
sum(train$srch_co == "", na.rm = TRUE)
# The earlier selection of empty check-in rows is no longer needed.
rm(record_without_ci)
gc()
# deal with 0 distances ####
# take sample as study data ####
# Draw a reproducible random sample of 0.1% of the train rows, without
# replacement, to keep the exploratory work fast.
set.seed(12345)
study.train <- train[
  sample(seq_len(nrow(train)), .001 * nrow(train), replace = FALSE),
]
# processing data ####
# Treat the two label columns and the site id as categorical variables.
factor_cols <- c("is_booking", "hotel_cluster", "site_name")
study.train[
  , (factor_cols) := lapply(.SD, as.factor),
  .SDcols = factor_cols
]
# study
# Number of distinct destination type ids in the sample.
length(unique(study.train[["srch_destination_type_id"]]))
# plot some image ####
# One relative-frequency bar chart per column, drawn in this order.
for (col_name in c(
  "hotel_cluster", "is_booking", "site_name",
  "posa_continent", "user_location_country", "srch_destination_id"
)) {
  counts <- table(study.train[[col_name]])
  barplot(counts / sum(counts))
}
head(study.train)
# convert date_time, srch_ci and srch_co to Date ####
date_cols <- c("date_time", "srch_ci", "srch_co")
study.train[
  , (date_cols) := lapply(.SD, as.Date, format = "%Y-%m-%d"),
  .SDcols = date_cols
]
# calculate the interval of time between search and check in (days) ####
study.train[, before_ci := as.integer(srch_ci - date_time)]
# calculate the time of stay (days) ####
study.train[, stay_time := as.integer(srch_co - srch_ci)]
# take a look at the search-to-check-in interval (before_ci) ####
# The box plot is clipped to the 0-100 range on the y axis.
boxplot(x = study.train$before_ci, ylim = c(0, 100))
mean(study.train$before_ci, na.rm = TRUE)
#####################################################################################
# Consider filling in missing orig_destination_distance values with a
# regression tree (work in progress).
# Print the sampled rows whose distance is missing.
study.train[is.na(orig_destination_distance), ]
library(rpart)
# Positional column selection kept from the original analysis.
# NOTE(review): these positions depend on the column order of study.train
# (an `id` column is prepended earlier in this script, which would shift
# every position by one) -- confirm they still point at the intended columns.
cols <- c(3, 4, 5, 6, 17, 18, 21, 22, 23, 24, 7)
# Columns the code below cannot work without; always keep them by name so
# the split and the model do not silently depend on the positions above.
model_cols <- c(
  "orig_destination_distance", "srch_destination_id", "hotel_market"
)
keep_cols <- union(names(study.train)[cols], model_cols)
# `with = FALSE` makes data.table treat keep_cols as a vector of column
# names; a bare variable in j is evaluated as an expression instead of being
# used to select columns.
study.train.distance <- study.train[, keep_cols, with = FALSE]
# consider to convert origin and dest to factor to calculate the distance.
# but there still some distance is missing.
# Convert to factor BEFORE splitting so that both subsets share one set of
# levels; converting each subset separately gives different level sets and
# predict() then fails on levels the model has never seen.
study.train.distance$srch_destination_id <-
  as.factor(study.train.distance$srch_destination_id)
study.train.distance$hotel_market <-
  as.factor(study.train.distance$hotel_market)
# separate into na and non na
# A logical mask is used rather than `-which(...)`: when no value is NA,
# `-which(...)` is an empty index and would select zero rows instead of all.
distance_missing <- is.na(study.train.distance$orig_destination_distance)
distance.train <- study.train.distance[!distance_missing, ]
distance.result <- study.train.distance[distance_missing, ]
head(distance.train)
# Fit the tree on rows with a known distance, then predict the missing ones.
tree <- rpart(
  orig_destination_distance ~ srch_destination_id + hotel_market,
  data = distance.train
)
pred <- predict(tree, distance.result)
summary(pred)
# calculate distance. tbc...
#####################################################################################
# playground ####
# Share of sampled rows that have no distance information.
sum(is.na(study.train$orig_destination_distance)) / nrow(study.train)
# author's observation: about 36% of the rows have no distance information
max(study.train$orig_destination_distance, na.rm = TRUE)
# author's observation: the max distance is 11779, which seems plausible
# (units assumed to be km -- TODO confirm)
# What if the destination country and the user's country are the same?
plot(
  x = study.train$orig_destination_distance,
  y = study.train$hotel_country == study.train$user_location_country
)
# Number of rows where the two country ids are equal (NA comparisons are
# not counted).
sum(study.train$hotel_country == study.train$user_location_country,
    na.rm = TRUE)
# There are not many, which is doubtful: does nobody book domestic travel?
# NOTE(review): the two columns may not share the same id encoding --
# confirm before trusting this comparison.
# How about the continent?
plot(
  x = study.train$orig_destination_distance,
  y = study.train$posa_continent == study.train$hotel_continent
)
sum(study.train$posa_continent == study.train$hotel_continent, na.rm = TRUE)
# There are more matches, but this does not look correct either.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment