Skip to content

Instantly share code, notes, and snippets.

@klauszhang
Last active May 16, 2016 13:06
Show Gist options
  • Save klauszhang/d615add551ccc4e3785a335b60c647fb to your computer and use it in GitHub Desktop.
Save klauszhang/d615add551ccc4e3785a335b60c647fb to your computer and use it in GitHub Desktop.
to convert dates into discrete values
library(data.table)
# read csv
expedia_train <- fread("train.csv", header = TRUE)
# convert dates: search timestamp, check-in and check-out become Date objects;
# values that do not match the format (or are empty) become NA
dt <- as.Date(expedia_train$date_time, format = "%Y-%m-%d")
ci <- as.Date(expedia_train$srch_ci, format = "%Y-%m-%d")
co <- as.Date(expedia_train$srch_co, format = "%Y-%m-%d")
# rows whose check-in or check-out date is missing
del_idx <- which(is.na(ci) | is.na(co))
# save to another place (the source table must still exist at this point,
# so it is only removed after the copy has been taken)
data <- expedia_train
rm(expedia_train)
## remove null data
# Guard against an empty index: x[-integer(0)] selects nothing, which would
# silently drop every row when the file has no missing dates.
if (length(del_idx) > 0) {
  data <- data[-del_idx, ]
  co <- co[-del_idx]
  ci <- ci[-del_idx]
  dt <- dt[-del_idx]
}
# calculate dates
# stay_days: length of the stay (check-out minus check-in)
stay_days <- co - ci
# before_ci: lead time between the search and the check-in
# (previously computed from the check-out date, which added the stay length)
before_ci <- ci - dt
# delete negative stays and searches made after the check-in date;
# which() ignores NA comparisons, so rows with an unparsed search date stay
del_idx <- which(stay_days < 0 | before_ci < 0)
# remove again
# Guard against an empty index: x[-integer(0)] selects nothing, which would
# silently drop every row when there is nothing to delete.
if (length(del_idx) > 0) {
  data <- data[-del_idx, ]
  co <- co[-del_idx]
  ci <- ci[-del_idx]
  dt <- dt[-del_idx]
  before_ci <- before_ci[-del_idx]
  stay_days <- stay_days[-del_idx]
}
# clean up
rm(del_idx)
# convert stuff: day differences become plain integers
before_ci <- as.integer(before_ci)
stay_days <- as.integer(stay_days)
# month of the search and month of the check-in
# (checkin_month previously reused the search date and so duplicated
# search_month)
search_month <- month(dt)
checkin_month <- month(ci)
# more clean up: the Date vectors are no longer needed
rm(ci)
rm(co)
rm(dt)
# the raw date columns are replaced by the derived features below
data$date_time <- NULL
data$srch_ci <- NULL
data$srch_co <- NULL
# combine all together
# Generic cbind() dispatches on the class of `data`, so the table keeps its
# class; explicit names pin the new column names.
data <- cbind(
  data,
  before_ci = before_ci,
  stay_days = stay_days,
  checkin_month = checkin_month,
  search_month = search_month
)
# cleanup all others
rm(before_ci)
rm(checkin_month)
rm(search_month)
rm(stay_days)
# save the result
save(data, file = "expedia_data.processed.RData")
####### calculate k mean
library(FSelector)

# pick one market (the third distinct value encountered) and work on its rows;
# which() drops NA comparisons, and the data$ form works for both data.table
# and plain data.frame inputs
markets <- unique(data$hotel_market)
m1 <- as.data.frame(data[which(data$hotel_market == markets[3]), ])

# hc holds the hotel cluster labels; take them out of the feature table so the
# target is not also used as a predictor / clustering feature
hc <- m1$hotel_cluster
m1$hotel_cluster <- NULL

# remove distance because it has null
m1$orig_destination_distance <- NULL
m1$user_id <- NULL
# remove market because it is useless (single value after the filter above)
m1$hotel_market <- NULL
m1$hotel_continent <- NULL
m1$hotel_country <- NULL

# information gain of every remaining feature with respect to the cluster
# label; the factor target is attached only for this call so that the feature
# table itself stays numeric
weights <- information.gain(hc ~ ., cbind(m1, hc = as.factor(hc)))

# standardise the features (scale() requires numeric columns) and cluster
m1 <- scale(m1)
m1 <- as.data.frame(m1)
cl <- kmeans(m1, 8)

# compare the k-means assignment with the hotel cluster labels
sort(table(hc[which(cl$cluster == 1)]), decreasing = TRUE)
sort(table(hc), decreasing = TRUE)
hist(hc)
hist(hc[which(cl$cluster == 8)])
length(unique(hc[which(cl$cluster == 3)]))
length(unique(hc))
table(hc)
table(cl$cluster)

# hierarchical alternative: cutree() needs an hclust tree, not a data frame.
# NOTE(review): dist() needs memory quadratic in the number of rows; confirm
# the selected market is small enough before running this.
cutree(hclust(dist(m1)), k = 8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment