Last active
September 7, 2019 05:50
-
-
Save shedoesdatascience/fbdd1b52c5089d3b80ba512f969e704c to your computer and use it in GitHub Desktop.
web_traffic_lead_prediction.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## 1. Set libraries and import data ####
library(data.table)
library(dplyr)
library(padr)
library(xgboost)
library(Matrix)
library(RcppRoll)
library(zoo)
library(readr)
library(sqldf)

# Question: how likely is a user to submit a lead the next day?
data_interactions <- read_csv(
  "C:\\Users\\srivastavas\\Downloads\\data-interactions\\data-interactions.csv"
)
## 2. Data cleaning ####

# Remove exact duplicate event rows before aggregating.
unique_data_interactions <- unique(data_interactions)
unique_data_interactions$event <- as.factor(unique_data_interactions$event)
# Collapse the timestamp to a calendar date for daily aggregation.
unique_data_interactions$date <-
  lubridate::date(unique_data_interactions$utc_timestamp)

# Daily per-user counts of leads, views and searches.
sql_str <- "select userid, date
  , SUM(CASE WHEN event = 'leads' THEN 1 ELSE 0 END) AS num_leads
  , SUM(CASE WHEN event = 'view' THEN 1 ELSE 0 END) AS num_views
  , SUM(CASE WHEN event = 'search' THEN 1 ELSE 0 END) AS num_searches
  FROM unique_data_interactions
  GROUP BY userid, date"
df1 <- sqldf(sql_str, stringsAsFactors = FALSE)

# Day of week encoded 1-7 (ISO: Monday = 1). The original
# as.numeric(as.factor(weekdays(...))) coding was locale-dependent and
# ordered days alphabetically, so the same data produced different feature
# values on different machines; format(%u) is deterministic everywhere.
df1$day <- as.integer(format(as.Date(df1$date), "%u"))
# Day of month, taken from the YYYY-MM-DD date string.
df1$day_num <- as.numeric(substr(df1$date, 9, 10))
# Order chronologically so lag()/cumsum() below see records in time order.
df1 <- df1[order(df1$date), ]
## 3. Feature engineering: per-user lag and cumulative variables ####
df2 <- df1 %>%
  group_by(userid) %>%
  mutate(
    lag_leads_1      = lag(num_leads, 1),     # previous day's lead count (lagged target)
    cum_sum_views    = cumsum(num_views),     # running total of views per user
    cum_sum_searches = cumsum(num_searches),  # running total of searches per user
    lag_views_1      = lag(num_views, 1),     # previous day's views
    lag_searches_1   = lag(num_searches, 1)   # previous day's searches
  )

# Spot-check the engineered features for a single user.
example <- df2[df2$userid == "0001df26-753e-4d09-9923-6fb8fe3d7a1b", ]

# Rows without a previous-day record cannot be used to predict the next-day
# lead, so drop rows where the lagged target is missing.
df3 <- df2[!is.na(df2$lag_leads_1), ]
# Binary target: did the user submit at least one lead that day?
df3$lead_flag <- ifelse(df3$num_leads >= 1, 1, 0)
## 4. Modelling: train/test split on disjoint users ####
# Split by user rather than by row so the same user never appears in both
# sets; otherwise the cumulative/lag features would leak information.
set.seed(42)  # fix the RNG so the sampled user split is reproducible
user_ids <- as.data.frame(unique(df3$userid))
names(user_ids) <- "user_ids"
train_user_ids <- sample_frac(user_ids, 0.75)  # 75% of users go to training
names(train_user_ids) <- "user_ids"
train <- df3[df3$userid %in% train_user_ids$user_ids, ]
# Every row whose user is not in train becomes the test set.
test <- anti_join(df3, train, by = c("userid" = "userid"))
## 5a. Build the training design matrix ####
label <- train$lead_flag

# model.matrix drops NA rows by default, which would misalign the matrix
# with `label`; switch to na.pass while building and restore afterwards.
# options() returns the previous value, so the restore is exact.
previous_na_action <- options(na.action = "na.pass")
# NOTE(review): the original call passed contrasts.arg = c('day') (invalid -
# contrasts.arg must be a named list, and `day` is numeric here so contrasts
# do not apply) plus `sparse`/`sci` arguments that sparse.model.matrix does
# not have; all are dropped.
trainMatrix <- sparse.model.matrix(
  ~ num_views + num_searches + day + lag_leads_1 + cum_sum_views +
    cum_sum_searches + lag_views_1 + lag_searches_1,
  data = train
)
options(na.action = previous_na_action$na.action)

# xgboost's native container pairing features with the binary label.
trainDMatrix <- xgb.DMatrix(data = trainMatrix, label = label)
## 5b. Train the XGBoost classifier ####
# Parameters for binary classification (probability output via logistic link).
params <- list(
  booster          = "gbtree",
  objective        = "binary:logistic",
  eta              = 0.3,  # learning rate
  gamma            = 0,    # minimum loss reduction required to split
  max_depth        = 4,
  min_child_weight = 1,
  subsample        = 1,
  colsample_bytree = 1
)

# 5-fold CV to choose the number of boosting rounds; stop when the held-out
# metric has not improved for 20 rounds. TRUE/FALSE are spelled out rather
# than T/F, which are ordinary reassignable variables in R.
xgbcv <- xgb.cv(
  params = params, data = trainDMatrix, nrounds = 1000,
  nfold = 5, showsd = TRUE, stratified = TRUE,
  print_every_n = 10, early_stopping_rounds = 20, maximize = FALSE
)
num_iterations <- xgbcv$best_iteration

# Refit on the full training set with the CV-selected round count.
# eval.metric is passed twice deliberately: xgboost tracks both metrics.
model <- xgb.train(
  data = trainDMatrix,
  params = params,
  nrounds = num_iterations,
  maximize = FALSE,
  eval.metric = "error", eval.metric = "logloss"
)

# Feature importance from the fitted booster.
importance <- xgb.importance(feature_names = colnames(trainDMatrix), model = model)
xgb.ggplot.importance(importance_matrix = importance)
## 6. Evaluate on the held-out users ####
# Build the test design matrix with the same formula as training. na.action
# must be set to na.pass BEFORE the matrix is built (the original only
# restored the option afterwards, so NA rows would have been silently
# dropped and predictions misaligned with `test`).
previous_na_action <- options(na.action = "na.pass")
testMatrix <- sparse.model.matrix(
  ~ num_views + num_searches + day + lag_leads_1 + cum_sum_views +
    cum_sum_searches + lag_views_1 + lag_searches_1,
  data = test
)
options(na.action = previous_na_action$na.action)

pred <- predict(model, testMatrix)    # predicted P(lead next day)
prediction <- as.numeric(pred > 0.5)  # hard labels at a 0.5 threshold
print(head(prediction))

# BUG FIX: the true labels live in test$lead_flag. The original used
# testMatrix$label, which is invalid ($ cannot index a matrix and no
# "label" column exists there).
err <- mean(prediction != test$lead_flag)
print(paste("test-error=", err))

result <- sum(prediction == test$lead_flag) / length(test$lead_flag)
print(paste("Final Accuracy =", sprintf("%1.2f%%", 100 * result)))

importance_matrix <- xgb.importance(model = model)
print(importance_matrix)
xgb.plot.importance(importance_matrix = importance_matrix)
xgb.dump(model, with_stats = TRUE)
# xgb.plot.tree(model = model)

# Persist predictions alongside the test rows (no spurious row-name column).
test$prediction <- prediction
write.csv(test, "predictions.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment