# klauszhang/expedia.boost.r

## expedia.boost.r
library(data.table)

# Number of rows held out of expedia_train as the evaluation sample.
smp_size <- 10000
# Fix the RNG seed so the same rows are held out on every run.
set.seed(1234)
# Draw smp_size distinct row positions of expedia_train.
idx <- sample.int(nrow(expedia_train), size = smp_size)
# Negative indexing drops the held-out rows from the training set;
# the same positions then select the evaluation rows.
train <- expedia_train[-idx, ]
test <- expedia_train[idx, ]

# Nominal number of blend steps to evaluate.
num_of_test <- 40
# Step between consecutive blend values, in percent (the loop below
# multiplies each value by .01 before using it).
by_val <- .5
# Blend values to try: 0, by_val, ..., round(num_of_test * by_val).
# Both endpoints are included, so this holds one more value than
# num_of_test (41 with the settings above).
run_times <- seq(0, round(num_of_test * by_val), by = by_val)
# Result container: one slot per tested value. Sized from run_times rather
# than num_of_test so it never has to grow while the loop is filling it.
final <- numeric(length(run_times))
# Write position into final.
k <- 1

# Score of one (market, cluster) group: a blend of sum(x) and length(x),
# with `weight` as the share given to the plain row count.
# x is presumably the 0/1 is_booking flag, making sum(x) the number of
# bookings -- confirm against the data.
# `weight` defaults to the global `prop` set by the loop below.
sum_and_count <- function(x, weight = prop) {
  sum(x) * (1 - weight) + length(x) * weight
}

# Returns the (up to) minVal entries of hc with the highest scores v1,
# best first, collapsed into a single "|"-separated string.
top_n <- function(hc, v1, minVal = 5) {
  ranked <- hc[order(v1, decreasing = TRUE)]
  # head() copes with groups shorter than minVal (and with minVal = 0).
  paste(head(ranked, minVal), collapse = "|")
}

# Evaluate every blend value in run_times; final[k] receives the score of
# run_times[k].
for (k in seq_along(run_times)) {
  i <- run_times[k]
  print(i)
  # run_times is expressed in percent; convert to a proportion.
  prop <- i * .01

  # One score per (market, cluster) pair, computed on the training rows.
  hotel_with_score <-
    train[, sum_and_count(is_booking, prop),
          by = list(hotel_market, hotel_cluster)]

  # Best-scoring clusters of each market as one "a|b|c" string (column V1).
  dest_top_n <-
    hotel_with_score[, top_n(hotel_cluster, V1), by = hotel_market]

  # generate result
  # merge() re-orders rows by the join key, so the true cluster must be read
  # from the merged table, not from `test`, to stay aligned with the
  # prediction. all.x = TRUE keeps test rows whose market never occurs in
  # train; they get an NA prediction and are counted as misses below.
  scored <- merge(test, dest_top_n, by = "hotel_market", all.x = TRUE)
  output <- scored[, list(hotel_cluster = V1)]
  #table(output)

  # test the result
  # Compare whole cluster ids. A regex built from "a|b|c" would also accept
  # substrings (e.g. "1" inside "91") and inflate the score.
  actual <- as.character(scored[["hotel_cluster"]])
  predicted <- strsplit(output$hotel_cluster, "|", fixed = TRUE)
  result <- vapply(
    seq_along(actual),
    function(j) actual[j] %in% predicted[[j]],
    logical(1)
  )

  # Share of test rows whose true cluster is among the suggestions.
  # mean() yields 0 (not NA) when there is no hit at all.
  final[k] <- mean(result)
  print(final[k])
}
# Blend value(s), in percent, that produced the best score. All ties are
# listed; slots holding NA are ignored.
run_times[which(final == max(final, na.rm = TRUE))]
# The best score itself: the largest fraction of evaluated rows flagged TRUE
# by the loop above. This is a top-5 hit rate, not the rank-weighted MAP@5.
max(final, na.rm = TRUE)

# Plot score against blend value. The connecting line must use the same
# x coordinates as the points; lines(final) alone would draw it against the
# index 1..length(final) and misplace it on this axis.
plot(x = run_times, y = final)
lines(x = run_times, y = final)
	library(data.table)

	# Number of rows held out of expedia_train as the evaluation sample.
	smp_size <- 10000
	# Fix the RNG seed so the same rows are held out on every run.
	set.seed(1234)
	# Draw smp_size distinct row positions of expedia_train.
	idx <- sample.int(nrow(expedia_train), size = smp_size)
	# Negative indexing drops the held-out rows from the training set;
	# the same positions then select the evaluation rows.
	train <- expedia_train[-idx, ]
	test <- expedia_train[idx, ]

	# Nominal number of blend steps to evaluate.
	num_of_test <- 40
	# Step between consecutive blend values, in percent (the loop below
	# multiplies each value by .01 before using it).
	by_val <- .5
	# Blend values to try: 0, by_val, ..., round(num_of_test * by_val).
	# Both endpoints are included, so this holds one more value than
	# num_of_test (41 with the settings above).
	run_times <- seq(0, round(num_of_test * by_val), by = by_val)
	# Result container: one slot per tested value. Sized from run_times rather
	# than num_of_test so it never has to grow while the loop is filling it.
	final <- numeric(length(run_times))
	# Write position into final.
	k <- 1

	# Score of one (market, cluster) group: a blend of sum(x) and length(x),
	# with `weight` as the share given to the plain row count.
	# x is presumably the 0/1 is_booking flag, making sum(x) the number of
	# bookings -- confirm against the data.
	# `weight` defaults to the global `prop` set by the loop below.
	sum_and_count <- function(x, weight = prop) {
	  sum(x) * (1 - weight) + length(x) * weight
	}

	# Returns the (up to) minVal entries of hc with the highest scores v1,
	# best first, collapsed into a single "|"-separated string.
	# The separator is a plain "|": "\|" is not a valid escape in an R string
	# literal and stops the file from parsing.
	top_n <- function(hc, v1, minVal = 5) {
	  ranked <- hc[order(v1, decreasing = TRUE)]
	  # head() copes with groups shorter than minVal (and with minVal = 0).
	  paste(head(ranked, minVal), collapse = "|")
	}

	# Evaluate every blend value in run_times; final[k] receives the score of
	# run_times[k].
	for (k in seq_along(run_times)) {
	  i <- run_times[k]
	  print(i)
	  # run_times is expressed in percent; convert to a proportion.
	  prop <- i * .01

	  # One score per (market, cluster) pair, computed on the training rows.
	  hotel_with_score <-
	    train[, sum_and_count(is_booking, prop),
	          by = list(hotel_market, hotel_cluster)]

	  # Best-scoring clusters of each market as one "a|b|c" string (column V1).
	  dest_top_n <-
	    hotel_with_score[, top_n(hotel_cluster, V1), by = hotel_market]

	  # generate result
	  # merge() re-orders rows by the join key, so the true cluster must be
	  # read from the merged table, not from `test`, to stay aligned with the
	  # prediction. all.x = TRUE keeps test rows whose market never occurs in
	  # train; they get an NA prediction and are counted as misses below.
	  scored <- merge(test, dest_top_n, by = "hotel_market", all.x = TRUE)
	  output <- scored[, list(hotel_cluster = V1)]
	  #table(output)

	  # test the result
	  # Compare whole cluster ids. A regex built from "a|b|c" would also accept
	  # substrings (e.g. "1" inside "91") and inflate the score.
	  actual <- as.character(scored[["hotel_cluster"]])
	  predicted <- strsplit(output$hotel_cluster, "|", fixed = TRUE)
	  result <- vapply(
	    seq_along(actual),
	    function(j) actual[j] %in% predicted[[j]],
	    logical(1)
	  )

	  # Share of test rows whose true cluster is among the suggestions.
	  # mean() yields 0 (not NA) when there is no hit at all.
	  final[k] <- mean(result)
	  print(final[k])
	}
	# Blend value(s), in percent, that produced the best score. All ties are
	# listed; slots holding NA are ignored.
	run_times[which(final == max(final, na.rm = TRUE))]
	# The best score itself: the largest fraction of evaluated rows flagged
	# TRUE by the loop above. This is a top-5 hit rate, not the rank-weighted
	# MAP@5.
	max(final, na.rm = TRUE)

	# Plot score against blend value. The connecting line must use the same
	# x coordinates as the points; lines(final) alone would draw it against
	# the index 1..length(final) and misplace it on this axis.
	plot(x = run_times, y = final)
	lines(x = run_times, y = final)