Created
May 5, 2016 05:58
-
-
Save klauszhang/94cc1773497ebacce926c309d9aa554d to your computer and use it in GitHub Desktop.
Exploratory attempt to mine the Expedia booking search data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read data
# Load the train and test sets with data.table's CSV reader. Both paths are
# relative, so the files are looked up in the current working directory.
library(data.table)
train <- fread("train.csv", header = TRUE)
test <- fread("test.csv", header = TRUE)
# Trigger a garbage collection once both files are loaded.
gc()
# Label columns (per the original note): is_booking, hotel_cluster
head(train)
head(test)
# process the data
# create id for train data ####
# Prepend the row names as an `id` column; later steps use it as a row key.
train <- cbind(id = rownames(train), train)
# train$site_name<-as.factor(train$site_name)
# collect the entries whose check-in date is empty ####
# NOTE(review): comparing against "" assumes srch_ci was read as text --
# confirm for the data.table version in use.
record_without_ci <- train[train$srch_ci == "", ]
head(record_without_ci$id)
head(train$id)
# verify that only entries with an empty check-in date were selected ####
# Join the full table onto the selection by id, keeping every selected row.
# After the merge, srch_ci.x comes from `train` and srch_ci.y from the
# selection.
verify <- merge(x = train, y = record_without_ci, by = "id", all.y = TRUE)
head(verify)
# Distinct check-in values on the `train` side of the join. If the selection
# is right, only the empty string is listed.
levels(as.factor(verify$srch_ci.x))
# Distinct check-in values on the selection side of the join.
levels(as.factor(verify$srch_ci.y))
# cleanup
rm(verify)
# remove entries without ci/co ####
# Gather the ids of the records with an empty check-in or an empty check-out
# date (believed to be invalid) and drop them from the train dataset in one
# pass.
ids_to_drop <- c(record_without_ci$id, train[train$srch_co == "", ]$id)
train <- train[!(id %in% ids_to_drop), ]
rm(ids_to_drop, record_without_ci)
# Check that no entry with an empty ci or co is left; both counts should be 0.
sum(train$srch_ci == "", na.rm = TRUE)
sum(train$srch_co == "", na.rm = TRUE)
gc()
# deal with 0 distances ####
# (nothing is implemented for this step yet)
# take sample as study data ####
# Draw a reproducible 0.1% sample of the rows, without replacement.
set.seed(12345)
# sample() truncates a fractional size; floor() makes that explicit.
sample_size <- floor(0.001 * nrow(train))
study.train <- train[
  sample(seq_len(nrow(train)), sample_size, replace = FALSE),
]
# processing data ####
# Treat these coded columns as categorical.
factor_cols <- c("is_booking", "hotel_cluster", "site_name")
study.train[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
# study
# Number of distinct destination type ids in the sample.
length(unique(study.train$srch_destination_type_id))
# plot some image ####
# For each column listed, draw a bar chart of the relative frequency of its
# values.
for (column in c("hotel_cluster", "is_booking", "site_name", "posa_continent",
                 "user_location_country", "srch_destination_id")) {
  counts <- table(study.train[[column]])
  barplot(counts / sum(counts))
}
# Drop the loop variable so only `counts` is left behind, as before.
rm(column)
head(study.train)
# convert date_time, check-in and check-out to Date ####
# Values are parsed with the year-month-day format only.
date_cols <- c("date_time", "srch_ci", "srch_co")
study.train[, (date_cols) := lapply(.SD, as.Date, format = "%Y-%m-%d"),
            .SDcols = date_cols]
# calculate the interval of time between search and check-in, in days ####
study.train[, before_ci := as.integer(srch_ci - date_time)]
# calculate the time of stay, in days ####
study.train[, stay_time := as.integer(srch_co - srch_ci)]
# take a look at the search-to-check-in interval ####
# NOTE(review): the original heading said "stay time", but the code plots and
# averages before_ci. The code is kept as is and the heading corrected --
# confirm which column was meant.
boxplot(x = study.train$before_ci, ylim = c(0, 100))
mean(study.train$before_ci, na.rm = TRUE)
#####################################################################################
# consider fixing the distance NA by using a decision tree
# Rows of the sample that have no origin-destination distance.
study.train[is.na(orig_destination_distance), ]
library(rpart)
# Columns carried into the imputation table, picked by position as in the
# original analysis.
# NOTE(review): positions depend on the column order of study.train (an id
# column is prepended earlier in this script) -- confirm they still point at
# the intended columns. The three columns the model needs are added by name so
# the steps below do not depend on the positions being right.
cols <- union(
  names(study.train)[c(3, 4, 5, 6, 17, 18, 21, 22, 23, 24, 7)],
  c("orig_destination_distance", "srch_destination_id", "hotel_market")
)
# `with = FALSE` makes data.table treat `cols` as a vector of column names
# instead of evaluating it as an expression inside the table.
study.train.distance <- study.train[, cols, with = FALSE]
# separate into na and non na
# A logical mask is used rather than -which(...): a negative index built from
# an empty which() would select no rows at all.
distance_missing <- is.na(study.train.distance$orig_destination_distance)
distance.train <- study.train.distance[!distance_missing, ]
distance.result <- study.train.distance[distance_missing, ]
head(distance.train)
# consider converting origin and destination to factors to estimate the
# distance, but some distances are still missing.
distance.train$srch_destination_id <-
  as.factor(distance.train$srch_destination_id)
distance.train$hotel_market <- as.factor(distance.train$hotel_market)
# Reuse the training levels for the rows to predict. Values never seen in
# training become NA rather than new factor levels, which predict() may
# reject.
distance.result$srch_destination_id <- factor(
  distance.result$srch_destination_id,
  levels = levels(distance.train$srch_destination_id)
)
distance.result$hotel_market <- factor(
  distance.result$hotel_market,
  levels = levels(distance.train$hotel_market)
)
# Regression tree of distance on destination and hotel market.
tree <- rpart(orig_destination_distance ~ srch_destination_id + hotel_market,
              data = distance.train)
pred <- predict(tree, distance.result)
summary(pred)
# calculate distance. tbc...
#####################################################################################
# playground ####
# Share of sampled rows with no distance information.
mean(is.na(study.train$orig_destination_distance))
# the original run reported that about 36% have no distance information
max(study.train$orig_destination_distance, na.rm = TRUE)
# the original author noted a maximum of 11779 km, which seemed plausible
# (the unit cannot be verified from this script)
# what if the destination country and the user's country are the same?
plot(x = study.train$orig_destination_distance,
     y = study.train$hotel_country == study.train$user_location_country)
nrow(study.train[hotel_country == user_location_country])
# there are not many... but this is doubtful: does nobody book domestic travel?
# NOTE(review): the two columns may use different country codings -- confirm.
# how about the continent?
plot(x = study.train$orig_destination_distance,
     y = study.train$posa_continent == study.train$hotel_continent)
nrow(study.train[posa_continent == hotel_continent])
# there are more matches, but this does not look right either.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment