# klauszhang/expedia.r

## expedia.r
# read data
library(data.table)
# Load the training and test sets from the working directory; both files are
# read with their first line as the header.
train <- fread("train.csv", header = TRUE)
test <- fread("test.csv", header = TRUE)
# Run a garbage collection once both reads are done.
gc()

# Peek at both tables. The labels of interest are is_booking and hotel_cluster.
head(train)
head(test)
# process the data
# Give every training row an identifier ####
# The table's row names are bound on as a new first column called `id`, so
# individual rows can be referred to after later filtering.
train <- cbind(id = rownames(train), train)
# train$site_name<-as.factor(train$site_name)
# Collect the rows whose check-in date is the empty string ####
record_without_ci <- train[train$srch_ci == "", ]

head(record_without_ci$id)
head(train$id)
# Sanity check: join the selection back onto train by id ####
# all.y = TRUE keeps every selected row; columns present in both inputs get
# the .x (train side) and .y (selection side) suffixes.
verify <- merge(x = train, y = record_without_ci, by = "id", all.y = TRUE)
head(verify)

# Distinct check-in values on the train side of the join ...
levels(as.factor(verify$srch_ci.x))

# ... and on the selection side.
levels(as.factor(verify$srch_ci.y))

# Drop the temporary join result.
rm(verify)
# Remove entries without a check-in or check-out date ####
# Rows with an empty srch_ci or srch_co are considered incorrect and are
# dropped from train. Both id sets are gathered before any row is removed,
# then filtered out in one pass.
ids_to_drop <- union(record_without_ci$id, train[train$srch_co == "", ]$id)
train <- train[!(id %in% ids_to_drop), ]
rm(ids_to_drop)

# Confirm that no rows with an empty check-in or check-out date remain.
nrow(train[train$srch_ci == "", ])
nrow(train[train$srch_co == "", ])

rm(record_without_ci)
gc()
# deal with 0 distances ####

# Draw a 0.1% random sample of train (without replacement) to study ####
set.seed(12345)
sample_rows <- sample(
  seq_len(nrow(train)),
  0.001 * nrow(train),
  replace = FALSE
)
study.train <- train[sample_rows, ]
rm(sample_rows)
# Treat the two labels and the site identifier as categorical ####
study.train$is_booking <- as.factor(study.train$is_booking)
study.train$hotel_cluster <- as.factor(study.train$hotel_cluster)
study.train$site_name <- as.factor(study.train$site_name)
# Number of distinct destination types in the sample.
length(unique(study.train$srch_destination_type_id))
# Relative-frequency bar charts ####
# One chart per column, drawn in the order listed. `counts` is left holding
# the table for the last column.
for (plot_col in c("hotel_cluster", "is_booking", "site_name",
                   "posa_continent", "user_location_country",
                   "srch_destination_id")) {
  counts <- table(study.train[[plot_col]])
  barplot(counts / sum(counts))
}
rm(plot_col)

head(study.train)
# Parse date_time, srch_ci and srch_co as Dates (year-month-day) ####
ymd_format <- "%Y-%m-%d"
study.train$date_time <- as.Date(study.train$date_time, format = ymd_format)
study.train$srch_ci <- as.Date(study.train$srch_ci, format = ymd_format)
study.train$srch_co <- as.Date(study.train$srch_co, format = ymd_format)
rm(ymd_format)
# Days between the search and the check-in ####
study.train$before_ci <- as.integer(study.train$srch_ci - study.train$date_time)
# Length of the stay in days ####
study.train$stay_time <- as.integer(study.train$srch_co - study.train$srch_ci)
# Look at the lead time before check-in (y axis limited to 0-100) ####
boxplot(x = study.train$before_ci, ylim = c(0, 100))
mean(study.train$before_ci, na.rm = TRUE)

#####################################################################################
# Consider filling the missing distances with a decision tree.
study.train[is.na(orig_destination_distance), ]

library(rpart)

# Pick the working columns by name. A bare variable in `j`
# (`study.train[, cols]`) is not a column selection in data.table, and fixed
# positions shift as soon as a column such as `id` is added in front.
# NOTE(review): the earlier positional list could not be mapped back to names
# from this script alone, so only columns the script refers to are kept here;
# extend the vector if more predictors are wanted.
cols <- c("posa_continent", "user_location_country", "srch_destination_id",
          "hotel_continent", "hotel_country", "hotel_market",
          "orig_destination_distance")
study.train.distance <- study.train[, cols, with = FALSE]

# Split on whether the distance is known. A logical mask is used because
# x[-which(cond), ] selects zero rows when cond is never TRUE.
distance_missing <- is.na(study.train.distance$orig_destination_distance)
distance.train <- study.train.distance[!distance_missing, ]
distance.result <- study.train.distance[distance_missing, ]
rm(distance_missing)

head(distance.train)

# Treat destination and market ids as categorical predictors.
# Some distances may still remain missing after this step.
distance.train$srch_destination_id <- as.factor(distance.train$srch_destination_id)
distance.train$hotel_market <- as.factor(distance.train$hotel_market)

# Encode the rows to predict with the training levels. Ids that never occur in
# distance.train become NA instead of new factor levels, which predict()
# would reject.
distance.result$srch_destination_id <- factor(
  distance.result$srch_destination_id,
  levels = levels(distance.train$srch_destination_id)
)
distance.result$hotel_market <- factor(
  distance.result$hotel_market,
  levels = levels(distance.train$hotel_market)
)

# Regress the known distances on destination and market.
tree <- rpart(
  orig_destination_distance ~ srch_destination_id + hotel_market,
  data = distance.train
)

# Predicted distances for the rows where none was recorded.
pred <- predict(tree, distance.result)
summary(pred)

# calculate distance. tbc...
#####################################################################################

# playground ####
# Share of sampled rows with no distance recorded.
sum(is.na(study.train$orig_destination_distance)) / nrow(study.train)
# Author's observation: about 36% of rows have no distance information.

max(study.train$orig_destination_distance, na.rm = TRUE)
# Author's observation: the maximum is 11779 (noted as km), which seems
# plausible.

# Is the hotel in the same country as the user?
plot(x=study.train$orig_destination_distance,
     y=study.train$hotel_country==study.train$user_location_country)
# Count of rows where both country codes match (NA comparisons not counted).
sum(study.train$hotel_country == study.train$user_location_country,
    na.rm = TRUE)
# Author's observation: the count is low, which is doubtful. Does nobody book
# domestic travel?

# Same question at continent level.
plot(x=study.train$orig_destination_distance,
     y=study.train$posa_continent==study.train$hotel_continent)
sum(study.train$posa_continent == study.train$hotel_continent, na.rm = TRUE)
# Author's observation: there are more matches, but this does not look right
# either.
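	# read data
	# NOTE(review): from here to the end the script repeats the pipeline above
	# line for line (only the indentation differs); confirm the second copy is
	# intentional.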
	library(data.table)
	# Load the training and test sets from the working directory; both files are
	# read with their first line as the header.
	train <- fread("train.csv", header = TRUE)
	test <- fread("test.csv", header = TRUE)
	# Run a garbage collection once both reads are done.
	gc()

	# Peek at both tables. The labels of interest are is_booking and hotel_cluster.
	head(train)
	head(test)
	# process the data
	# Give every training row an identifier ####
	# The table's row names are bound on as a new first column called `id`, so
	# individual rows can be referred to after later filtering.
	train <- cbind(id = rownames(train), train)
	# train$site_name<-as.factor(train$site_name)
	# Collect the rows whose check-in date is the empty string ####
	record_without_ci <- train[train$srch_ci == "", ]

	head(record_without_ci$id)
	head(train$id)
	# Sanity check: join the selection back onto train by id ####
	# all.y = TRUE keeps every selected row; columns present in both inputs get
	# the .x (train side) and .y (selection side) suffixes.
	verify <- merge(x = train, y = record_without_ci, by = "id", all.y = TRUE)
	head(verify)

	# Distinct check-in values on the train side of the join ...
	levels(as.factor(verify$srch_ci.x))

	# ... and on the selection side.
	levels(as.factor(verify$srch_ci.y))

	# Drop the temporary join result.
	rm(verify)
	# Remove entries without a check-in or check-out date ####
	# Rows with an empty srch_ci or srch_co are considered incorrect and are
	# dropped from train. Both id sets are gathered before any row is removed,
	# then filtered out in one pass.
	ids_to_drop <- union(record_without_ci$id, train[train$srch_co == "", ]$id)
	train <- train[!(id %in% ids_to_drop), ]
	rm(ids_to_drop)

	# Confirm that no rows with an empty check-in or check-out date remain.
	nrow(train[train$srch_ci == "", ])
	nrow(train[train$srch_co == "", ])

	rm(record_without_ci)
	gc()
	# deal with 0 distances ####

	# Draw a 0.1% random sample of train (without replacement) to study ####
	set.seed(12345)
	sample_rows <- sample(
	  seq_len(nrow(train)),
	  0.001 * nrow(train),
	  replace = FALSE
	)
	study.train <- train[sample_rows, ]
	rm(sample_rows)
	# Treat the two labels and the site identifier as categorical ####
	study.train$is_booking <- as.factor(study.train$is_booking)
	study.train$hotel_cluster <- as.factor(study.train$hotel_cluster)
	study.train$site_name <- as.factor(study.train$site_name)
	# Number of distinct destination types in the sample.
	length(unique(study.train$srch_destination_type_id))
	# Relative-frequency bar charts ####
	# One chart per column, drawn in the order listed. `counts` is left holding
	# the table for the last column.
	for (plot_col in c("hotel_cluster", "is_booking", "site_name",
	                   "posa_continent", "user_location_country",
	                   "srch_destination_id")) {
	  counts <- table(study.train[[plot_col]])
	  barplot(counts / sum(counts))
	}
	rm(plot_col)

	head(study.train)
	# Parse date_time, srch_ci and srch_co as Dates (year-month-day) ####
	ymd_format <- "%Y-%m-%d"
	study.train$date_time <- as.Date(study.train$date_time, format = ymd_format)
	study.train$srch_ci <- as.Date(study.train$srch_ci, format = ymd_format)
	study.train$srch_co <- as.Date(study.train$srch_co, format = ymd_format)
	rm(ymd_format)
	# Days between the search and the check-in ####
	study.train$before_ci <- as.integer(study.train$srch_ci - study.train$date_time)
	# Length of the stay in days ####
	study.train$stay_time <- as.integer(study.train$srch_co - study.train$srch_ci)
	# Look at the lead time before check-in (y axis limited to 0-100) ####
	boxplot(x = study.train$before_ci, ylim = c(0, 100))
	mean(study.train$before_ci, na.rm = TRUE)

	#####################################################################################
	# Consider filling the missing distances with a decision tree.
	study.train[is.na(orig_destination_distance), ]

	library(rpart)

	# Pick the working columns by name. A bare variable in `j`
	# (`study.train[, cols]`) is not a column selection in data.table, and fixed
	# positions shift as soon as a column such as `id` is added in front.
	# NOTE(review): the earlier positional list could not be mapped back to names
	# from this script alone, so only columns the script refers to are kept here;
	# extend the vector if more predictors are wanted.
	cols <- c("posa_continent", "user_location_country", "srch_destination_id",
	          "hotel_continent", "hotel_country", "hotel_market",
	          "orig_destination_distance")
	study.train.distance <- study.train[, cols, with = FALSE]

	# Split on whether the distance is known. A logical mask is used because
	# x[-which(cond), ] selects zero rows when cond is never TRUE.
	distance_missing <- is.na(study.train.distance$orig_destination_distance)
	distance.train <- study.train.distance[!distance_missing, ]
	distance.result <- study.train.distance[distance_missing, ]
	rm(distance_missing)

	head(distance.train)

	# Treat destination and market ids as categorical predictors.
	# Some distances may still remain missing after this step.
	distance.train$srch_destination_id <- as.factor(distance.train$srch_destination_id)
	distance.train$hotel_market <- as.factor(distance.train$hotel_market)

	# Encode the rows to predict with the training levels. Ids that never occur in
	# distance.train become NA instead of new factor levels, which predict()
	# would reject.
	distance.result$srch_destination_id <- factor(
	  distance.result$srch_destination_id,
	  levels = levels(distance.train$srch_destination_id)
	)
	distance.result$hotel_market <- factor(
	  distance.result$hotel_market,
	  levels = levels(distance.train$hotel_market)
	)

	# Regress the known distances on destination and market.
	tree <- rpart(
	  orig_destination_distance ~ srch_destination_id + hotel_market,
	  data = distance.train
	)

	# Predicted distances for the rows where none was recorded.
	pred <- predict(tree, distance.result)
	summary(pred)

	# calculate distance. tbc...
	#####################################################################################

	# playground ####
	# Share of sampled rows with no distance recorded.
	sum(is.na(study.train$orig_destination_distance)) / nrow(study.train)
	# Author's observation: about 36% of rows have no distance information.

	max(study.train$orig_destination_distance, na.rm = TRUE)
	# Author's observation: the maximum is 11779 (noted as km), which seems
	# plausible.

	# Is the hotel in the same country as the user?
	plot(x=study.train$orig_destination_distance,
	y=study.train$hotel_country==study.train$user_location_country)
	# Count of rows where both country codes match (NA comparisons not counted).
	sum(study.train$hotel_country == study.train$user_location_country,
	    na.rm = TRUE)
	# Author's observation: the count is low, which is doubtful. Does nobody book
	# domestic travel?

	# Same question at continent level.
	plot(x=study.train$orig_destination_distance,
	y=study.train$posa_continent==study.train$hotel_continent)
	sum(study.train$posa_continent == study.train$hotel_continent, na.rm = TRUE)
	# Author's observation: there are more matches, but this does not look right
	# either.