Skip to content

Instantly share code, notes, and snippets.

@shedoesdatascience
Last active September 7, 2019 05:50
Show Gist options
  • Save shedoesdatascience/fbdd1b52c5089d3b80ba512f969e704c to your computer and use it in GitHub Desktop.
web_traffic_lead_prediction.R
## 1. Set libraries and import data ####
# Goal: predict whether a user will submit a lead, using per-user daily
# event counts and one-day-lagged features, with an xgboost classifier.
library(data.table)
library(dplyr)
library(padr)
library(xgboost)
library(Matrix)
library(RcppRoll)
library(zoo)
library(readr)
library(sqldf)
# NOTE(review): lubridate is used later via lubridate::date() but is never
# attached here — confirm the package is installed.
#how likely is a user going to submit a lead the next day?
# Hard-coded absolute Windows path: only runs on the author's machine.
data_interactions<-read_csv("C:\\Users\\srivastavas\\Downloads\\data-interactions\\data-interactions.csv")
## 2. Data cleaning ####
# Remove exact duplicate event rows (duplicates were found in the raw extract).
unique_data_interactions <- unique(data_interactions)
unique_data_interactions$event <- as.factor(unique_data_interactions$event)
# Calendar date of each event, derived from the UTC timestamp.
unique_data_interactions$date <- lubridate::date(unique_data_interactions$utc_timestamp)

# Aggregate to one row per user per day with counts of each event type.
sql_str <- "select userid, date
, SUM(CASE WHEN event = 'leads' THEN 1 ELSE 0 END) AS num_leads
, SUM(CASE WHEN event = 'view' THEN 1 ELSE 0 END) AS num_views
, SUM(CASE WHEN event = 'search' THEN 1 ELSE 0 END) AS num_searches
FROM unique_data_interactions
GROUP BY userid, date"
df1 <- sqldf(sql_str, stringsAsFactors = FALSE)

# Day-of-week as a stable numeric code (1 = Sunday ... 7 = Saturday).
# The original as.numeric(as.factor(weekdays(...))) was locale-dependent:
# weekdays() returns translated day names, so the alphabetical factor levels
# (and hence the numeric codes) change with the machine's locale.
df1$day <- as.POSIXlt(as.Date(df1$date))$wday + 1
# Day of month, parsed from the date itself rather than by fixed substring
# positions (substr(date, 9, 10) silently breaks on any other date format).
df1$day_num <- as.numeric(format(as.Date(df1$date), "%d"))
# Order chronologically so the per-user lag features built next are correct.
df1 <- df1[order(df1$date), ]
# 3. Create per-user lag and cumulative features ----
# df1 is already date-ordered; arrange again within each user so the lags
# remain chronological even if the upstream ordering changes.
df2 <- df1 %>%
  group_by(userid) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(
    lag_leads_1      = lag(num_leads, 1),      # previous day's lead count
    cum_sum_views    = cumsum(num_views),      # views to date
    cum_sum_searches = cumsum(num_searches),   # searches to date
    lag_views_1      = lag(num_views, 1),      # previous day's views
    lag_searches_1   = lag(num_searches, 1)    # previous day's searches
  ) %>%
  # Drop the grouping so downstream base-R subsetting behaves predictably
  # (the original left df2 grouped — a common dplyr footgun).
  ungroup()

# Spot-check the feature rows for one user.
example <- df2[df2$userid == "0001df26-753e-4d09-9923-6fb8fe3d7a1b", ]

# A user's first observed day has no previous-day record, so it cannot be
# scored from lagged features; drop those rows.
df3 <- df2[!is.na(df2$lag_leads_1), ]
# Binary target: did the user submit at least one lead on this day?
df3$lead_flag <- ifelse(df3$num_leads >= 1, 1, 0)
## 4. Modelling: split into train/test on users ####
# Split by user (not by row) so no user appears in both sets, which would
# leak that user's behaviour into the test set.
# Fixed seed so the split — and therefore the reported metrics — is
# reproducible (the original had no seed, so results varied run to run).
set.seed(42)
all_user_ids <- unique(df3$userid)
train_user_ids <- sample(all_user_ids, size = floor(0.75 * length(all_user_ids)))
train <- df3[df3$userid %in% train_user_ids, ]
# Everyone not in the training users forms the test set (base-R equivalent
# of the original anti_join on userid).
test <- df3[!(df3$userid %in% train_user_ids), ]
#set up model
label <- train$lead_flag
# model.matrix drops NA rows by default; switch to na.pass so the design
# matrix keeps every row and stays aligned with `label`, then restore the
# caller's setting afterwards.
previous_na_action <- options("na.action")
options(na.action = "na.pass")
# Build the sparse design matrix for the model.
# Fixes vs the original call: removed a stray empty argument (`data = train,`
# followed immediately by another comma, which triggers a missing-argument
# error), removed contrasts.arg = c('day') (contrasts.arg must be a *named
# list*, and `day` is numeric here so contrasts do not apply anyway), and
# removed `sparse`/`sci`, which are not arguments of sparse.model.matrix.
trainMatrix <- sparse.model.matrix(
  ~ num_views + num_searches + day + lag_leads_1 + cum_sum_views +
    cum_sum_searches + lag_views_1 + lag_searches_1,
  data = train
)
options(na.action = previous_na_action$na.action)
# Package matrix + labels into xgboost's native container.
trainDMatrix <- xgb.DMatrix(data = trainMatrix, label = label)
## 5. Fit the model ####
# Hyperparameters for binary classification (logistic output in [0, 1]).
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.3,            # learning rate
  gamma = 0,            # minimum loss reduction required to split
  max_depth = 4,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1
)
# Cross-validate to choose the number of boosting rounds; stop early once
# the held-out metric has not improved for 20 rounds.
# (T/F replaced with TRUE/FALSE — T and F are ordinary variables in R and
# can be reassigned.)
xgbcv <- xgb.cv(
  params = params, data = trainDMatrix, nrounds = 1000,
  nfold = 5, showsd = TRUE, stratified = TRUE,
  print_every_n = 10, early_stopping_rounds = 20, maximize = FALSE
)
num_iterations <- xgbcv$best_iteration
# Train the final model on all training data with the CV-selected rounds.
model <- xgb.train(
  data = trainDMatrix,
  params = params,
  nrounds = num_iterations,
  maximize = FALSE,
  eval.metric = "error", eval.metric = "logloss"
)
# Which features drive the predictions?
importance <- xgb.importance(feature_names = colnames(trainMatrix), model = model)
xgb.ggplot.importance(importance_matrix = importance)
## 6. Evaluate on the held-out users ####
# Build the test design matrix with the same formula as training.
# BUG FIX: na.action must be set to na.pass *before* building the matrix —
# the original only restored it afterwards — otherwise rows containing NA
# are silently dropped and predictions no longer align with test$lead_flag.
# (The stray empty argument and invalid contrasts.arg from the original
# call are removed here as in the training-matrix step.)
previous_na_action <- options("na.action")
options(na.action = "na.pass")
testMatrix <- sparse.model.matrix(
  ~ num_views + num_searches + day + lag_leads_1 + cum_sum_views +
    cum_sum_searches + lag_views_1 + lag_searches_1,
  data = test
)
options(na.action = previous_na_action$na.action)

# Predicted probability of lead_flag for each test row; classify at 0.5.
pred <- predict(model, testMatrix)
prediction <- as.numeric(pred > 0.5)
print(head(prediction))
# BUG FIX: the original computed the error against testMatrix$label — `$`
# is not valid on a model matrix and `label` was never a column, so this
# line errored at runtime. The true labels live in test$lead_flag.
err <- mean(prediction != test$lead_flag)
print(paste("test-error=", err))
result <- sum(prediction == test$lead_flag) / length(test$lead_flag)
print(paste("Final Accuracy =", sprintf("%1.2f%%", 100 * result)))
# Feature importance and tree dump for inspection.
importance_matrix <- xgb.importance(model = model)
print(importance_matrix)
xgb.plot.importance(importance_matrix = importance_matrix)
xgb.dump(model, with_stats = TRUE)
# xgb.plot.tree(model = model)  # optional: visualize individual trees
# Persist per-row predictions alongside the test features.
test$prediction <- prediction
write.csv(test, "predictions.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment