Skip to content

Instantly share code, notes, and snippets.

View klauszhang's full-sized avatar

Hao Zhang klauszhang

View GitHub Profile
@klauszhang
klauszhang / linear_regression_by_gradient_descent.R
Last active April 27, 2016 05:55 — forked from cbare/linear_regression_by_gradient_descent.R
Linear regression by gradient descent
##
## Linear regression by gradient descent
##
## A learning exercise to help build intuition about gradient descent.
## J. Christopher Bare, 2012
##
# set random seed
set.seed(12345)
@klauszhang
klauszhang / expedia.r
Created May 5, 2016 05:58
try to mine expedia booking search data
# read data
library(data.table)
# Load the training and test CSVs from the working directory as data.tables.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change what `header` receives.
train <- fread("train.csv", header = TRUE)
test <- fread("test.csv", header = TRUE)
# Trigger a garbage collection after the two reads.
gc()
# the label: is_booking, hotel_cluster
# Print the first rows of each table for a quick look.
head(train)
head(test)
# process the data
size<-1000
devider<-10
Nt<-numeric(size)
result<-numeric(size/devider)
for (i in 1:size) {
X<-runif(1000, min=0, max=1)
S<-cumsum(X)
t<-100
Nt[i]<-sum(S<t)
result[i]<-sum(Nt)/i
@klauszhang
klauszhang / expedia_exp_1.r
Created May 12, 2016 06:21
expedia hotel prediction
# forked from https://www.kaggle.com/signochastic/expedia-hotel-recommendations/r-version-of-most-popular-local-hotel
## R version of most popular local hotels
library(data.table)
expedia_train <- fread('../input/train.csv', header=TRUE)
expedia_test <- fread('../input/test.csv', header=TRUE)
#' Weighted blend of the sum and the length of a vector
#'
#' Returns `sum(x) * sum_weight + length(x) * count_weight`. With the default
#' weights the result is the same as the original hard-coded formula
#' `sum(x) * 0.835 + length(x) * 0.165`.
#'
#' @param x A numeric or logical vector.
#'   NOTE(review): presumably a 0/1 booking indicator aggregated per group;
#'   confirm against the caller.
#' @param sum_weight Weight applied to `sum(x)`. Defaults to 0.835.
#' @param count_weight Weight applied to `length(x)`. Defaults to 0.165.
#' @return A single numeric score; 0 for a zero-length `x`.
sum_and_count <- function(x, sum_weight = 0.835, count_weight = 0.165) {
  sum(x) * sum_weight + length(x) * count_weight
}
@klauszhang
klauszhang / expedia_exp_2.r
Created May 12, 2016 06:22
expedia hotel prediction
# forked from https://www.kaggle.com/zfturbo/expedia-hotel-recommendations/r-some-tweaks/code
## R version of most popular local hotels (change variable)
library(data.table)
expedia_train <- fread('../input/train.csv', header=TRUE)
expedia_test <- fread('../input/test.csv', header=TRUE)
sum_and_count <- function(x){
# take the weight of clicking and browsing
sum(x)*0.95 + length(x) *0.05
@klauszhang
klauszhang / expedia_top_n.r
Last active May 14, 2016 22:46
add cleanup
# Load data ####
library(data.table)
# Read the full training set. `TRUE` is spelled out because `T` is an
# ordinary, reassignable variable.
expedia_train <- fread("train.csv", header = TRUE)
#expedia_test <- fread('test.csv', header = TRUE)
# Size of the training partition: 98% of the rows, rounded down.
smp_size <- floor(0.98 * nrow(expedia_train))
## set the seed to make your partition reproducible
@klauszhang
klauszhang / expedia.similarity.r
Created May 14, 2016 10:58
Iterate through all user IDs and calculate similarity - not working; consider vectorizing.
all_user_id <- unique(train$user_id)
similar_table<-list()
counter<-1
for (user in all_user_id) {
similar_user<-c(user)
current <- train[user_id == user,]
for (next_user in all_user_id[-user]) {
# calculate similarity
# this is a test
@klauszhang
klauszhang / expedia.cluster_by_market.r
Created May 14, 2016 13:17
cluster by market, using c4.5 and knn, not working...
# Explore a single hotel market taken from the training data.
# NOTE(review): `train` is not defined in this snippet; it is assumed to be a
# data.table that has already been loaded. Confirm before running.

# Number of distinct hotel markets.
length(unique(train$hotel_market))
hotel_markets <- unique(train$hotel_market)
# Keep only the rows belonging to the second distinct market.
market <- train[hotel_market == hotel_markets[2], ]
summary(market)
unique(market$hotel_continent)
# setkey() with no column names keys (sorts) the table on all of its columns.
setkey(market)
# Parse the timestamp strings into POSIXct date-times.
market$date_time <- as.POSIXct(market$date_time)
# Drop rows whose check-in date is the empty string. A logical mask replaces
# the earlier `-which(...)` form, which removed EVERY row when no check-in
# date was empty (negating an empty index selects nothing). `%in%` never
# returns NA, so rows with a missing srch_ci are still kept, as before.
market <- market[!(market$srch_ci %in% ""), ]
@klauszhang
klauszhang / expedia.boost.r
Last active May 19, 2016 05:06
boost performance
library(data.table)
# Number of rows to hold out as the test set.
smp_size <- 10000
## Fix the RNG seed so the partition is reproducible.
set.seed(1234)
# Draw smp_size distinct row positions (sample() does not replace by default).
# NOTE(review): expedia_train is not defined in this snippet; it must already
# be loaded, and it needs at least smp_size rows or sample() will error.
idx <- sample(seq_len(nrow(expedia_train)), size = smp_size)
# The sampled rows form the held-out test set; all remaining rows are the
# training set.
test <- expedia_train[idx,]
train<-expedia_train[-idx,]
@klauszhang
klauszhang / process_dates.r
Last active May 16, 2016 13:06
to convert dates into discrete values
library(data.table)
# Read the training CSV. `TRUE` is spelled out because `T` is an ordinary,
# reassignable variable.
expedia_train <- fread("train.csv", header = TRUE)
# Collect the three date-like columns into an unnamed list, in this order:
# event timestamp, search check-in date, search check-out date.
dates <-
  list(
    expedia_train$date_time,
    expedia_train$srch_ci,
    expedia_train$srch_co
  )