####################################################################################################
## Training regression models to predict how many cycles are left for an engine (remaining useful life, RUL)
## Four models will be trained:
## 1. Decision forest;
## 2. Boosted decision tree;
## 3. Poisson regression;
## 4. Neural network
## Input : The processed train (train_norm_data.csv) and test (test_norm_data.csv) dataset
## Output: The results on the test dataset (predictions.csv) and the model evaluation metrics (model_evaluation.csv)
## edited by @cytms, time spent: 2 hours
####################################################################################################
setwd("~/Projects/m-project")
train_table <- read.table("train_norm_data.csv", header = TRUE, sep = ",")
####################################################################################################
## Find top 35 variables most correlated with RUL
####################################################################################################
train_vars <- names(train_table)
train_vars <- train_vars[!train_vars %in% c("label1", "label2", "id", "cycle_orig")]
train_table <- subset(train_table, select = train_vars)
correlation <- cor(train_table)
correlation <- correlation[, "RUL"]
correlation <- abs(correlation)
correlation <- correlation[order(correlation, decreasing = TRUE)]
correlation <- correlation[-1]
correlation <- correlation[1:35]
formula <- as.formula(paste("RUL ~", paste(names(correlation), collapse = " + ")))
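## Optional sanity check: inspect the selected predictors and the resulting formula.
# print(names(correlation))
# print(formula)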
####################################################################################################
## Import the test set into a data frame for faster prediction and model evaluation
####################################################################################################
prediction_df <- read.table("test_norm_data.csv", header = TRUE, sep = ",")
####################################################################################################
## Regression model evaluation metrics
####################################################################################################
evaluate_model <- function(observed, predicted) {
  mean_observed <- mean(observed)
  se <- (observed - predicted)^2        # squared errors
  ae <- abs(observed - predicted)       # absolute errors
  sem <- (observed - mean_observed)^2   # squared errors of the mean-only baseline
  aem <- abs(observed - mean_observed)  # absolute errors of the mean-only baseline
  mae <- mean(ae)
  rmse <- sqrt(mean(se))
  rae <- sum(ae) / sum(aem)
  rse <- sum(se) / sum(sem)
  rsq <- 1 - rse
  metrics <- c("Mean Absolute Error" = mae,
               "Root Mean Squared Error" = rmse,
               "Relative Absolute Error" = rae,
               "Relative Squared Error" = rse,
               "Coefficient of Determination" = rsq)
  return(metrics)
}
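## Quick illustrative check of the metric definitions: a perfect prediction should give
## zero MAE/RMSE and a coefficient of determination of 1.
# evaluate_model(observed = c(10, 20, 30), predicted = c(10, 20, 30))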
####################################################################################################
## Decision forest modeling
####################################################################################################
library(party)
library(randomForest)
set.seed(5)
forest_model <- randomForest(formula, data = train_table, ntree = 8, nodesize = 32, mtry = 35)
forest_prediction <- predict(forest_model, prediction_df)
# forest_model <- rxDForest(formula = formula,
#                           data = train_table,
#                           nTree = 8,
#                           maxDepth = 32,
#                           mTry = 35,
#                           seed = 5)
# rxSetComputeContext(local)
# forest_prediction <- rxPredict(modelObject = forest_model,
#                                data = prediction_df,
#                                predVarNames = "Forest_Prediction",
#                                overwrite = TRUE)
forest_metrics <- evaluate_model(observed = prediction_df$RUL,
                                 predicted = forest_prediction)
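## Optional diagnostics: randomForest's importance helpers show which of the 35 predictors
## the forest relies on most (illustrative only).
# importance(forest_model)
# varImpPlot(forest_model)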
####################################################################################################
## Boosted tree modeling
####################################################################################################
require(gbm)
require(MASS)
boosted_model <- gbm(formula, data = train_table, shrinkage = 0.2,
                     cv.folds = 10, n.minobsinnode = 10, n.trees = 100, distribution = "gaussian")
boosted_prediction <- predict(boosted_model, prediction_df, n.trees = boosted_model$n.trees)
# boosted_model <- rxBTrees(formula = formula,
#                           data = train_table,
#                           learningRate = 0.2,
#                           minSplit = 10,
#                           minBucket = 10,
#                           nTree = 100,
#                           seed = 5,
#                           lossFunction = "gaussian")
# rxSetComputeContext(local)
# boosted_prediction <- rxPredict(modelObject = boosted_model,
#                                 data = prediction_df,
#                                 predVarNames = "Boosted_Prediction",
#                                 overwrite = TRUE)
boosted_metrics <- evaluate_model(observed = prediction_df$RUL,
                                  predicted = boosted_prediction)
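## Optional alternative, assuming the 10-fold CV above is intended for model selection:
## let gbm pick the iteration with the lowest cross-validated error instead of all 100 trees.
# best_trees <- gbm.perf(boosted_model, method = "cv", plot.it = FALSE)
# boosted_prediction_cv <- predict(boosted_model, prediction_df, n.trees = best_trees)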
####################################################################################################
## Poisson regression modeling
####################################################################################################
poisson_model <- glm(formula, data = train_table, family = poisson())
# type = "response" returns predictions on the RUL (cycle) scale rather than the log-link scale
poisson_prediction <- predict(poisson_model, prediction_df, type = "response")
# poisson_model <- rxGlm(formula = formula,
#                        data = train_table,
#                        family = poisson())
# rxSetComputeContext(local)
# poisson_prediction <- rxPredict(modelObject = poisson_model,
#                                 data = prediction_df,
#                                 predVarNames = "Poisson_Prediction",
#                                 overwrite = TRUE)
poisson_metrics <- evaluate_model(observed = prediction_df$RUL,
                                  predicted = poisson_prediction)
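## Optional diagnostic: a Poisson model assumes the variance equals the mean, so a residual
## deviance much larger than its degrees of freedom suggests overdispersion; in that case a
## quasipoisson() fit may be worth comparing.
# poisson_model$deviance / poisson_model$df.residual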
####################################################################################################
## Neural network regression modeling
####################################################################################################
library(nnet)
# Scale RUL to [0, 1] so it matches the range of the network's logistic output unit
train_df <- train_table
max_train_rul <- max(train_df$RUL)
train_df$RUL <- train_df$RUL / max_train_rul
nodes <- 10
# Number of weights for 35 inputs, one hidden layer of `nodes` units and one output unit
weights <- nodes * (35 + 1) + nodes + 1
nnet_model <- nnet(formula = formula,
                   data = train_df,
                   Wts = rep(0.1, weights),
                   size = nodes,
                   decay = 0.005,
                   MaxNWts = weights)
nnet_prediction <- predict(object = nnet_model,
                           newdata = prediction_df)
# Rescale predictions back to the original RUL range
nnet_prediction <- nnet_prediction * max_train_rul
nnet_prediction <- as.data.frame(nnet_prediction)
names(nnet_prediction) <- "Nnet_Prediction"
nnet_metrics <- evaluate_model(observed = prediction_df$RUL,
                               predicted = nnet_prediction$Nnet_Prediction)
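## Optional alternative (sketch): nnet can fit an unbounded linear output with linout = TRUE,
## which avoids having to scale RUL to [0, 1] and rescale the predictions afterwards.
# nnet_linear <- nnet(formula, data = train_table, size = nodes,
#                     decay = 0.005, linout = TRUE, MaxNWts = weights)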
####################################################################################################
## Write test predictions to CSV
####################################################################################################
predictions <- cbind(prediction_df$id, prediction_df$cycle_orig, forest_prediction,
                     boosted_prediction, poisson_prediction, nnet_prediction)
colnames(predictions) <- c("id", "cycle", "Forest_Prediction", "Boosted_Prediction",
                           "Poisson_Prediction", "Nnet_Prediction")
write.table(predictions, file = "predictions.csv", sep = ",", row.names = FALSE)
####################################################################################################
## Combine metrics and write to CSV
####################################################################################################
metrics_df <- rbind(forest_metrics, boosted_metrics, poisson_metrics, nnet_metrics)
metrics_df <- as.data.frame(metrics_df)
rownames(metrics_df) <- NULL
Algorithms <- c("Decision Forest",
                "Boosted Decision Tree",
                "Poisson Regression",
                "Neural Network")
metrics_df <- cbind(Algorithms, metrics_df)
write.table(metrics_df, file = "model_evaluation.csv", sep = ",", row.names = FALSE)
# metrics_table <- RxSqlServerData(table = "Regression_metrics",
#                                  connectionString = connection_string)
# rxDataStep(inData = metrics_df,
#            outFile = metrics_table,
#            overwrite = TRUE)
####################################################################################################
## Cleanup
####################################################################################################
rm(list = ls())