# shedoesdatascience/020_analyse_predictive_modelling.R

## 020_analyse_predictive_modelling.R
#****************************************************************************************
#
# PROJECT: 20181002
#
# MODULE: 020 - ANALYSE - PREDICTIVE MODELLING
#
# DESCRIPTION:

#
#
#
# STEPS
# 1.Set libraries
# 2. Set up data for modelling
# 3. Run GBM
#****************************************************************************************


##1. Set libraries ####
library(data.table)
library(dplyr)
library(caret)
library(ggplot2)


library(h2o) #no support for java 9 yet - get errors
# Start (or connect to) a local H2O instance once and keep its connection
# handle in `localH2O`. nthreads = -1 is passed through unchanged. A second
# bare h2o.init() call used to follow this line; it was redundant because the
# instance is already up by then.
localH2O <- h2o.init(nthreads = -1)


 ##2. Set up data for modelling ####


# Push the train/test partitions into the H2O cluster. `train` and `test`
# must already exist in the session (the import/ETL module writes them out as
# ./train.rds and ./test.rds).
model_train.h2o <- as.h2o(train)
model_test.h2o <- as.h2o(test)


#3. Run gbm ####

# Multi-class GBM on the original outcome `final_result`.
#
# Response and predictors are referenced by NAME instead of by position. The
# previous positional construction c(1:(y - 1), (y + 1):ncol) broke at the
# edges: with the response in the first column it produced c(1, 0, ...) and so
# fed the response back in as a predictor, with the response in the last
# column it ran past the final column, and with the response missing it failed
# on a zero-length which() result.
y_dv <- "final_result"

if (y_dv %in% colnames(model_train.h2o)) {
  # `success` (when present) is derived from `final_result`, so leaving it
  # among the predictors would leak the answer into the model.
  x_iv <- setdiff(colnames(model_train.h2o), c(y_dv, "success"))

  gbm_model <- h2o.gbm(
    y = y_dv,
    x = x_iv,
    training_frame = model_train.h2o,
    validation_frame = model_test.h2o,
    ntrees = 500,
    max_depth = 4,
    distribution = "multinomial", # for multi-classification
    learn_rate = 0.01,
    seed = 1234,
    max_hit_ratio_k = 3,
    nfolds = 5,
    keep_cross_validation_predictions = TRUE
  )

  # The binomial fit later in this script saves to the same path and will
  # overwrite this file.
  # NOTE(review): saveRDS presumably stores only the R-side handle of the H2O
  # model; h2o.saveModel() may be needed to persist the model itself - confirm.
  saveRDS(gbm_model, "./gbm_model.rds")

  variable.importance.list <- as.data.frame(h2o.varimp(gbm_model))

  summary(gbm_model) ## View information about the model.
} else {
  # The import/ETL module drops `final_result` before partitioning, so this
  # model can only be fitted on data that still carries that column.
  message("Column 'final_result' not found in the training frame; ",
          "skipping the multinomial GBM.")
}


# re-run gbm with re-defined dependent variable

# Binary GBM on the re-defined outcome `success`.
y_dv <- "success"
if (!(y_dv %in% colnames(model_train.h2o))) {
  stop("Column 'success' not found in the training frame.", call. = FALSE)
}

# Predictors are chosen by name. The former positional form 1:(y_dv - 1)
# evaluates to c(1, 0) when the response is the first column, and it would
# also have let `final_result` (the column `success` is derived from) in as a
# predictor whenever that column is still present.
x_iv <- setdiff(colnames(model_train.h2o), c(y_dv, "final_result"))

gbm_model <- h2o.gbm(
  y = y_dv,
  x = x_iv,
  training_frame = model_train.h2o,
  validation_frame = model_test.h2o,
  ntrees = 500,
  max_depth = 4,
  distribution = "bernoulli", # for binomial
  learn_rate = 0.01,
  seed = 1234,
  max_hit_ratio_k = 3,
  nfolds = 5,
  keep_cross_validation_predictions = TRUE
)

# NOTE(review): saveRDS presumably stores only the R-side handle of the H2O
# model; h2o.saveModel() may be needed to persist the model itself - confirm.
saveRDS(gbm_model, "./gbm_model.rds")

# Optional: Average the holdout AUCs
# Collect the id of every cross-validation model, then read each one's AUC.
# vapply() pins the result types, unlike sapply().
cv_model_ids <- vapply(
  gbm_model@model$cross_validation_models, `[[`, character(1), "name"
)
cvAUCs <- vapply(
  cv_model_ids,
  function(model_id) h2o.auc(h2o.getModel(model_id), valid = TRUE),
  numeric(1)
)

print(cvAUCs)
mean(cvAUCs)

variable.importance.list <- as.data.frame(h2o.varimp(gbm_model))


#Predict on test set
# Score the test frame once and reuse the result.
gbm.prediction <- h2o.predict(gbm_model, newdata = model_test.h2o, type = 'response')

# h2o.make_metrics() needs the probability of the positive (second) class.
# For a binomial model the prediction frame is laid out as
# <predict, P(first level), P(second level)>, so the wanted column is the
# third one; column 2 (used previously) is the probability of the first level.
# NOTE(review): assumes the levels of `success` are ordered "N", "Y" - confirm.
gbm.prediction_prob <- gbm.prediction[, 3]

predicted_values_model <- h2o.make_metrics(gbm.prediction_prob, model_test.h2o$success)

# Compute the test-set performance object once; AUC, FPR and TPR all read
# from it.
test_performance <- h2o.performance(gbm_model, newdata = model_test.h2o)
gbm.auc <- h2o.auc(test_performance)


#Produce AUC curve

fpr <- h2o.fpr(test_performance)[['fpr']]
tpr <- h2o.tpr(test_performance)[['tpr']]
ggplot(data.table(fpr = fpr, tpr = tpr), aes(fpr, tpr)) +
  geom_line() + theme_bw() + ggtitle(sprintf('AUC: %f', gbm.auc))


## open_learning_data.R
#****************************************************************************************
#
# PROJECT: 20181002
#
# MODULE: 010 - SOURCE - Import and data ETL
#
# DESCRIPTION:

#
#
#
# STEPS
# 1.Set libraries and import data
# 2. Data cleaning
# 3. Partition data to test and train datasets
#
#****************************************************************************************


## 1. Set libraries and import data ####


library(data.table)
library(readr)
library(dplyr)
library(caret)
library(dummies)


# Load the source dataset (ou_dataset.csv) from its fixed local path.
ou_data <- read_csv("C:\\Users\\ACAG077\\Desktop\\R learning\\Solving Business Problems\\anonymisedData\\ou_dataset.csv")

# Seed used later for the train/test partition.
seed <- 1270
## 2. Data cleaning ####
# De-duplicate rows
unique_ou_data <- unique(ou_data) # no duplicate rows found


# Discard the columns listed in `drops`.
drops <- c("id_student", "Sum_weighted_score")
keep_cols <- !(names(unique_ou_data) %in% drops)
unique_ou_data <- unique_ou_data[, keep_cols]


# Columns to be converted to factors.
factor_cols <- c(
  "code_module", "code_presentation", "gender", "region", "highest_education",
  "imd_band", "age_band", "disability", "final_result", "trimmed_assessment_type"
)

ou_data_DT <- data.table(unique_ou_data)

ads1 <- ou_data_DT[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]

#re-define dependent variable
# 'Y' for Pass/Distinction, 'N' for Withdrawn/Fail, NA for anything else.
is_success <- ads1$final_result %in% c('Pass', 'Distinction')
is_failure <- ads1$final_result %in% c('Withdrawn', 'Fail')
ads1$success <- NA
ads1$success[is_success] <- 'Y'
ads1$success[is_failure] <- 'N'

#remove final_result
ads1 <- ads1[, setdiff(names(ads1), "final_result"), with = FALSE]
ads1$success <- as.factor(ads1[["success"]])

## 3. partition data to test and train datasets ####
set.seed(seed)
# Stratify the 80/20 split on `success`. `final_result` has already been
# removed from `ads1` during cleaning, so ads1$final_result is NULL at this
# point and cannot be used as the stratification variable.
trainIndex <- createDataPartition(ads1$success,
                                  p = .8,
                                  list = FALSE,
                                  times = 1)


# Rows selected by the partition go to `train`; the remainder go to `test`.
train <- ads1[trainIndex, ]
test <- ads1[-trainIndex, ]

# Persist both partitions in the working directory.
saveRDS(train, "./train.rds")
saveRDS(test, "./test.rds")
	#****************************************************************************************
	#
	# PROJECT: 20181002
	#
	# MODULE: 020 - ANALYSE - PREDICTIVE MODELLING
	#
	# DESCRIPTION:

	#
	#
	#
	# STEPS
	# 1.Set libraries
	# 2. Set up data for modelling
	# 3. Run GBM
	#****************************************************************************************


	##1. Set libraries ####
	library(data.table)
	library(dplyr)
	library(caret)
	library(ggplot2)



	library(h2o) #no support for java 9 yet - get errors
	# Start (or connect to) a local H2O instance once and keep its connection
	# handle in `localH2O`. nthreads = -1 is passed through unchanged. A second
	# bare h2o.init() call used to follow this line; it was redundant because
	# the instance is already up by then.
	localH2O <- h2o.init(nthreads = -1)


	##2. Set up data for modelling ####



	# Push the train/test partitions into the H2O cluster. `train` and `test`
	# must already exist in the session (the import/ETL module writes them out
	# as ./train.rds and ./test.rds).
	model_train.h2o <- as.h2o(train)
	model_test.h2o <- as.h2o(test)


	#3. Run gbm ####

	# Multi-class GBM on the original outcome `final_result`.
	#
	# Response and predictors are referenced by NAME instead of by position.
	# The previous positional construction c(1:(y - 1), (y + 1):ncol) broke at
	# the edges: with the response in the first column it produced c(1, 0, ...)
	# and so fed the response back in as a predictor, with the response in the
	# last column it ran past the final column, and with the response missing
	# it failed on a zero-length which() result.
	y_dv <- "final_result"

	if (y_dv %in% colnames(model_train.h2o)) {
	  # `success` (when present) is derived from `final_result`, so leaving it
	  # among the predictors would leak the answer into the model.
	  x_iv <- setdiff(colnames(model_train.h2o), c(y_dv, "success"))

	  gbm_model <- h2o.gbm(
	    y = y_dv,
	    x = x_iv,
	    training_frame = model_train.h2o,
	    validation_frame = model_test.h2o,
	    ntrees = 500,
	    max_depth = 4,
	    distribution = "multinomial", # for multi-classification
	    learn_rate = 0.01,
	    seed = 1234,
	    max_hit_ratio_k = 3,
	    nfolds = 5,
	    keep_cross_validation_predictions = TRUE
	  )

	  # The binomial fit later in this script saves to the same path and will
	  # overwrite this file.
	  # NOTE(review): saveRDS presumably stores only the R-side handle of the
	  # H2O model; h2o.saveModel() may be needed to persist the model itself -
	  # confirm.
	  saveRDS(gbm_model, "./gbm_model.rds")

	  variable.importance.list <- as.data.frame(h2o.varimp(gbm_model))

	  summary(gbm_model) ## View information about the model.
	} else {
	  # The import/ETL module drops `final_result` before partitioning, so this
	  # model can only be fitted on data that still carries that column.
	  message("Column 'final_result' not found in the training frame; ",
	          "skipping the multinomial GBM.")
	}


	# re-run gbm with re-defined dependent variable

	# Binary GBM on the re-defined outcome `success`.
	y_dv <- "success"
	if (!(y_dv %in% colnames(model_train.h2o))) {
	  stop("Column 'success' not found in the training frame.", call. = FALSE)
	}

	# Predictors are chosen by name. The former positional form 1:(y_dv - 1)
	# evaluates to c(1, 0) when the response is the first column, and it would
	# also have let `final_result` (the column `success` is derived from) in as
	# a predictor whenever that column is still present.
	x_iv <- setdiff(colnames(model_train.h2o), c(y_dv, "final_result"))

	gbm_model <- h2o.gbm(
	  y = y_dv,
	  x = x_iv,
	  training_frame = model_train.h2o,
	  validation_frame = model_test.h2o,
	  ntrees = 500,
	  max_depth = 4,
	  distribution = "bernoulli", # for binomial
	  learn_rate = 0.01,
	  seed = 1234,
	  max_hit_ratio_k = 3,
	  nfolds = 5,
	  keep_cross_validation_predictions = TRUE
	)

	# NOTE(review): saveRDS presumably stores only the R-side handle of the H2O
	# model; h2o.saveModel() may be needed to persist the model itself - confirm.
	saveRDS(gbm_model, "./gbm_model.rds")

	# Optional: Average the holdout AUCs
	# Collect the id of every cross-validation model, then read each one's AUC.
	# vapply() pins the result types, unlike sapply().
	cv_model_ids <- vapply(
	  gbm_model@model$cross_validation_models, `[[`, character(1), "name"
	)
	cvAUCs <- vapply(
	  cv_model_ids,
	  function(model_id) h2o.auc(h2o.getModel(model_id), valid = TRUE),
	  numeric(1)
	)

	print(cvAUCs)
	mean(cvAUCs)

	variable.importance.list <- as.data.frame(h2o.varimp(gbm_model))



	#Predict on test set
	# Score the test frame once and reuse the result.
	gbm.prediction <- h2o.predict(gbm_model, newdata = model_test.h2o, type = 'response')

	# h2o.make_metrics() needs the probability of the positive (second) class.
	# For a binomial model the prediction frame is laid out as
	# <predict, P(first level), P(second level)>, so the wanted column is the
	# third one; column 2 (used previously) is the probability of the first
	# level.
	# NOTE(review): assumes the levels of `success` are ordered "N", "Y" -
	# confirm.
	gbm.prediction_prob <- gbm.prediction[, 3]

	predicted_values_model <- h2o.make_metrics(gbm.prediction_prob, model_test.h2o$success)

	# Compute the test-set performance object once; AUC, FPR and TPR all read
	# from it.
	test_performance <- h2o.performance(gbm_model, newdata = model_test.h2o)
	gbm.auc <- h2o.auc(test_performance)


	#Produce AUC curve

	fpr <- h2o.fpr(test_performance)[['fpr']]
	tpr <- h2o.tpr(test_performance)[['tpr']]
	ggplot(data.table(fpr = fpr, tpr = tpr), aes(fpr, tpr)) +
	  geom_line() + theme_bw() + ggtitle(sprintf('AUC: %f', gbm.auc))
	#****************************************************************************************
	#
	# PROJECT: 20181002
	#
	# MODULE: 010 - SOURCE - Import and data ETL
	#
	# DESCRIPTION:

	#
	#
	#
	# STEPS
	# 1.Set libraries and import data
	# 2. Data cleaning
	# 3. Partition data to test and train datasets
	#
	#****************************************************************************************


	## 1. Set libraries and import data ####


	library(data.table)
	library(readr)
	library(dplyr)
	library(caret)
	library(dummies)





	# Load the source dataset (ou_dataset.csv) from its fixed local path.
	ou_data <- read_csv("C:\\Users\\ACAG077\\Desktop\\R learning\\Solving Business Problems\\anonymisedData\\ou_dataset.csv")

	# Seed used later for the train/test partition.
	seed <- 1270
	## 2. Data cleaning ####
	# Remove duplicate rows
	unique_ou_data <- unique(ou_data) # no duplicate rows found


	# Discard the columns listed in `drops`.
	drops <- c("id_student", "Sum_weighted_score")
	unique_ou_data <- unique_ou_data[, !(names(unique_ou_data) %in% drops)]


	# Columns to be converted to factors.
	factor_cols <- c(
	  "code_module", "code_presentation", "gender", "region", "highest_education",
	  "imd_band", "age_band", "disability", "final_result", "trimmed_assessment_type"
	)

	ou_data_DT <- data.table(unique_ou_data)

	ads1 <- ou_data_DT[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]

	#re-define dependent variable
	# 'Y' for Pass/Distinction, 'N' for Withdrawn/Fail, NA for anything else.
	# The two conditions were previously joined with an escaped `\|`, which is
	# not valid R; %in% expresses the same membership test.
	ads1$success <- NA
	ads1$success[ads1$final_result %in% c('Pass', 'Distinction')] <- 'Y'
	ads1$success[ads1$final_result %in% c('Withdrawn', 'Fail')] <- 'N'

	#remove final_result
	ads1 <- ads1[, !c("final_result"), with = FALSE]
	ads1$success <- as.factor(ads1$success)

	## 3. partition data to test and train datasets ####
	set.seed(seed)
	# Stratify the 80/20 split on `success`. `final_result` has already been
	# removed from `ads1` during cleaning, so ads1$final_result is NULL at this
	# point and cannot be used as the stratification variable.
	trainIndex <- createDataPartition(ads1$success,
	                                  p = .8,
	                                  list = FALSE,
	                                  times = 1)


	# Rows selected by the partition go to `train`; the remainder go to `test`.
	train <- ads1[trainIndex, ]
	test <- ads1[-trainIndex, ]

	# Persist both partitions in the working directory.
	saveRDS(train, "./train.rds")
	saveRDS(test, "./test.rds")