Skip to content

Instantly share code, notes, and snippets.

@klauszhang
Last active May 19, 2016 05:06
Show Gist options
  • Save klauszhang/2e0789c859ff5f6773bc8f1a15b3fade to your computer and use it in GitHub Desktop.
Save klauszhang/2e0789c859ff5f6773bc8f1a15b3fade to your computer and use it in GitHub Desktop.
boost performance
library(data.table)
# Hold-out split and sweep configuration.
# `expedia_train` must already exist in the session; it is not created here.

# Number of rows drawn for the held-out evaluation sample.
smp_size <- 10000
# Fix the seed so the partition is reproducible.
set.seed(1234)
idx <- sample(seq_len(nrow(expedia_train)), size = smp_size)
test <- expedia_train[idx, ]
train <- expedia_train[-idx, ]
# Nominal number of sweep steps.
num_of_test <- 40
# Step between consecutive sweep values.
by_val <- .5
# Sequence of values to test. Both endpoints are included, so with the
# settings above it holds 41 values (0, 0.5, ..., 20), one more than
# `num_of_test`.
run_times <- seq(0, round(num_of_test * by_val), by = by_val)
# Result container: exactly one slot per tested value, so it never has to be
# grown while results are being stored.
final <- numeric(length(run_times))
# Position in `final` of the run being recorded.
k <- 1
# Sweep the blending weight. For every value in `run_times`, rank the hotel
# clusters of each market on `train`, then measure on `test` how often the
# true cluster of a row is among the top-ranked clusters of its market.
# The share of hits for each run is stored in `final`.

# Score one (hotel_market, hotel_cluster) group as a blend of the sum of `x`
# and the number of elements of `x`:
#   sum(x) * (1 - prop) + length(x) * prop
# `prop` is not an argument; it is looked up in the enclosing environment
# when the function is called, so it must be assigned before each use.
sum_and_count <- function(x) {
  sum(x) * (1 - prop) + length(x) * prop
}

# Return the (at most) `minVal` entries of `hc` with the highest scores,
# best first, collapsed into one "|"-separated string.
#   hc     - cluster identifiers
#   v1     - scores, aligned with `hc`
#   minVal - maximum number of clusters to keep
top_n <- function(hc, v1, minVal = 5) {
  hc_sorted <- hc[order(v1, decreasing = TRUE)]
  # head() copes with groups shorter than `minVal` without the 1:0 pitfall.
  paste(head(hc_sorted, minVal), collapse = "|")
}

for (k in seq_along(run_times)) {
  i <- run_times[k]
  print(i)
  prop <- i * .01
  hotel_with_score <-
    train[, sum_and_count(is_booking), by = list(hotel_market, hotel_cluster)]
  dest_top_n <-
    hotel_with_score[, top_n(hotel_cluster, V1), by = hotel_market]
  # merge() sorts its result by `hotel_market` and drops test rows whose
  # market does not occur in `train`, so the merged rows do not line up with
  # the rows of `test`. Take the true cluster and the prediction from the
  # same merged row instead of indexing back into `test`.
  scored <- merge(test, dest_top_n, by = "hotel_market")
  actual <- as.character(scored$hotel_cluster)
  top_lists <- strsplit(scored$V1, "|", fixed = TRUE)
  # Predictions only, under the column name `hotel_cluster`.
  output <- scored[, list(hotel_cluster = V1)]
  # Exact membership test: a regex alternation such as "9|1" would also
  # match inside "91" and report a false hit.
  result <- vapply(
    seq_along(top_lists),
    function(j) actual[j] %in% top_lists[[j]],
    logical(1)
  )
  # Share of evaluated rows that were hits (0 when there are none).
  final[k] <- mean(result)
  print(final[k])
}
# Sweep value(s) that produced the best score; ties are all reported.
run_times[which(final == max(final, na.rm = TRUE))]
# Best score reached. Each entry of `final` is the share of evaluated rows
# whose true cluster appears in the predicted top list. Rank inside the list
# is not taken into account, so this is a hit rate rather than MAP@5.
max(final, na.rm = TRUE)
# Plot the score against the tested value. The connecting line is given the
# same x coordinates as the points; without them it would be drawn against
# the element index and would not pass through the points.
plot(x = run_times, y = final)
lines(x = run_times, y = final)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment