Last active
March 15, 2018 16:50
-
-
Save cytms/66931344faf68f41326a7c8e578736bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################################################
## Training regression models to answer questions on how many cycles will be left for an engine
## Four models will be trained:
## 1. Decision forest;
## 2. Boosted decision tree;
## 3. Poisson regression modeling;
## 4. Neural network
## Input : The processed train (train_norm_data.csv) and test (test_norm_data.csv) datasets
## Output: The results on the test dataset (predictions.csv) and the model evaluation metrics (model_evaluation.csv)
## edited by @cytms, time spent: 2 hours
####################################################################################################
setwd("~/Projects/m-project")  # NOTE(review): hard-coded path; acceptable for a one-off script
train_table <- read.table("train_norm_data.csv", header = TRUE, sep = ",")
####################################################################################################
## Find top 35 variables most correlated with RUL
####################################################################################################
# Drop label/id/bookkeeping columns before computing correlations.
excluded_cols <- c("label1", "label2", "id", "cycle_orig")
train_table <- train_table[setdiff(names(train_table), excluded_cols)]
# Absolute correlation of every remaining column with RUL, strongest first;
# position 1 is RUL's correlation with itself, so drop it and keep the next 35.
rul_cor <- abs(cor(train_table)[, "RUL"])
rul_cor <- rul_cor[order(rul_cor, decreasing = TRUE)][-1][1:35]
# Build "RUL ~ v1 + v2 + ..." from the selected predictor names.
formula <- as.formula(paste(paste("RUL~"), paste(names(rul_cor), collapse = "+")))
####################################################################################################
## Import test into data frame for faster prediction and model evaluation
####################################################################################################
prediction_df <- read.table("test_norm_data.csv", header = TRUE, sep = ",")
# Compute standard regression metrics for a vector of predictions.
#
# observed:  numeric vector of ground-truth values.
# predicted: numeric vector of model predictions (same length as observed).
# Returns a named numeric vector with MAE, RMSE, RAE, RSE and R-squared.
evaluate_model <- function(observed, predicted) {
  # Errors of the model, and deviations of a trivial mean-only baseline.
  err <- observed - predicted
  dev <- observed - mean(observed)
  rse <- sum(err^2) / sum(dev^2)  # relative squared error vs. the mean baseline
  c("Mean Absolute Error" = mean(abs(err)),
    "Root Mean Squared Error" = sqrt(mean(err^2)),
    "Relative Absolute Error" = sum(abs(err)) / sum(abs(dev)),
    "Relative Squared Error" = rse,
    "Coefficient of Determination" = 1 - rse)
}
####################################################################################################
## Decision forest modeling
####################################################################################################
library(party)
library(randomForest)
set.seed(5)
# BUG FIX: the randomForest argument is `mtry`, not `myTry`. Unknown named
# arguments are silently absorbed by `...`, so the original call ignored the
# intended value and fell back to the default mtry (p/3 for regression)
# instead of using all 35 selected predictors per split.
# NOTE(review): the rx reference call below caps tree depth (maxDepth = 32);
# randomForest's nodesize is a different knob (minimum terminal-node size) --
# confirm this translation is what was intended.
forest_model <- randomForest(formula, train_table, ntree = 8, nodesize = 32, mtry = 35)
forest_prediction <- predict(forest_model, prediction_df)
# Equivalent ML-Server (RevoScaleR) call, kept for reference:
# forest_model <- rxDForest(formula = formula,
#                           data = train_table,
#                           nTree = 8,
#                           maxDepth = 32,
#                           mTry = 35,
#                           seed = 5)
# rxSetComputeContext(local)
# forest_prediction <- rxPredict(modelObject = forest_model,
#                                data = prediction_df,
#                                predVarNames = "Forest_Prediction",
#                                overwrite = TRUE)
forest_metrics <- evaluate_model(observed = prediction_df$RUL,
                                 predicted = forest_prediction)
####################################################################################################
## Boosted tree modeling
####################################################################################################
# library() errors on a missing package; require() only returns FALSE and the
# script would fail later with a confusing message.
library(gbm)
library(MASS)
# Seed the CV fold assignment for reproducibility (mirrors the commented
# rxBTrees(seed = 5) reference call below).
set.seed(5)
boosted_model <- gbm(formula, train_table, shrinkage = 0.2,
                     cv.folds = 10, n.minobsinnode = 10, n.trees = 100, distribution = "gaussian")
# predict.gbm warns and guesses when n.trees is omitted; pick the CV-optimal
# number of boosting iterations explicitly instead.
best_iter <- gbm.perf(boosted_model, method = "cv", plot.it = FALSE)
boosted_prediction <- predict(boosted_model, prediction_df, n.trees = best_iter)
# Equivalent ML-Server (RevoScaleR) call, kept for reference:
# boosted_model <- rxBTrees(formula = formula,
#                           data = train_table,
#                           learningRate = 0.2,
#                           minSplit = 10,
#                           minBucket = 10,
#                           nTree = 100,
#                           seed = 5,
#                           lossFunction = "gaussian")
# rxSetComputeContext(local)
# boosted_prediction <- rxPredict(modelObject = boosted_model,
#                                 data = prediction_df,
#                                 predVarNames = "Boosted_Prediction",
#                                 overwrite = TRUE)
boosted_metrics <- evaluate_model(observed = prediction_df$RUL,
                                  predicted = boosted_prediction)
####################################################################################################
## Poisson regression modeling
####################################################################################################
poisson_model <- glm(formula, train_table, family = poisson())
# BUG FIX: predict.glm defaults to type = "link", which for a Poisson model is
# the LOG of the expected count. The original therefore compared log-scale
# predictions against raw RUL values, making the metrics meaningless.
# Predict on the response scale instead.
poisson_prediction <- predict(poisson_model, prediction_df, type = "response")
# Equivalent ML-Server (RevoScaleR) call, kept for reference:
# poisson_model <- rxGlm(formula = formula,
#                        data = train_table,
#                        family = poisson())
# rxSetComputeContext(local)
# poisson_prediction <- rxPredict(modelObject = poisson_model,
#                                 data = prediction_df,
#                                 predVarNames = "Poisson_Prediction",
#                                 overwrite = TRUE)
# Removed leftover debug line `names(poisson_prediction) <- "test"`: it renamed
# only the first element (recycling NA for the rest) and served no purpose.
poisson_metrics <- evaluate_model(observed = prediction_df$RUL,
                                  predicted = poisson_prediction)
####################################################################################################
## Neural network regression modeling
####################################################################################################
library(nnet)
train_df <- train_table
max_train_rul <- max(train_df$RUL)
# Scale the target into [0, 1] so it matches nnet's default logistic output
# unit; predictions are scaled back up below.
train_df$RUL <- train_df$RUL / max_train_rul
nodes <- 10
# Derive the input count from the formula instead of hard-coding 35, so the
# weight budget stays correct if the predictor-selection step changes.
n_inputs <- length(attr(terms(formula), "term.labels"))
weights <- nodes * (n_inputs + 1) + nodes + 1
# BUG FIX: the original passed Wts = rep(0.1, weights). Identical starting
# weights make every hidden unit compute the same activation and receive the
# same gradient, so the network never breaks symmetry and collapses to a
# single effective hidden unit. Use nnet's random initialisation with a fixed
# seed for reproducibility instead.
set.seed(5)
nnet_model <- nnet(formula = formula,
                   data = train_df,
                   size = nodes,
                   decay = 0.005,
                   MaxNWts = weights)
nnet_prediction <- predict(object = nnet_model,
                           newdata = prediction_df)
# Undo the target scaling to get predictions back in cycle units.
nnet_prediction <- nnet_prediction * max_train_rul
nnet_prediction <- as.data.frame(nnet_prediction)
names(nnet_prediction) <- "Nnet_Prediction"
nnet_metrics <- evaluate_model(observed = prediction_df$RUL,
                               predicted = nnet_prediction$Nnet_Prediction)
####################################################################################################
## Write test predictions to CSV
## (NOTE(review): original comment said "to SQL"; the active code writes CSV
## files -- the RevoScaleR/SQL path is retained in comments below.)
####################################################################################################
predictions <- cbind(prediction_df$id, prediction_df$cycle_orig, forest_prediction,
                     boosted_prediction, poisson_prediction, nnet_prediction)
colnames(predictions) <- c("id", "cycle", "Forest_Prediction", "Boosted_Prediction",
                           "Poisson_Prediction", "Nnet_Prediction")
# row.names = FALSE: the default writes row names without a matching header
# cell, producing a header row with one fewer field than the data rows.
write.table(predictions, file = "predictions.csv", sep = ",", row.names = FALSE)
####################################################################################################
## Combine metrics and write to CSV
####################################################################################################
metrics_df <- rbind(forest_metrics, boosted_metrics, poisson_metrics, nnet_metrics)
metrics_df <- as.data.frame(metrics_df)
rownames(metrics_df) <- NULL
Algorithms <- c("Decision Forest",
                "Boosted Decision Tree",
                "Poisson Regression",
                "Neural Network")
metrics_df <- cbind(Algorithms, metrics_df)
write.table(metrics_df, file = "model_evaluation.csv", sep = ",", row.names = FALSE)
# Equivalent ML-Server (RevoScaleR) upload to SQL, kept for reference:
# metrics_table <- RxSqlServerData(table = "Regression_metrics",
#                                  connectionString = connection_string)
# rxDataStep(inData = metrics_df,
#            outFile = metrics_table,
#            overwrite = TRUE)
####################################################################################################
## Cleanup
####################################################################################################
# NOTE(review): rm(list = ls()) wipes the entire global environment; fine for a
# standalone batch script, hostile if this file is ever source()d interactively.
rm(list = ls())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
References
https://docs.microsoft.com/en-us/machine-learning-server/r-reference/revoscaler/rxdforest
https://www.rdocumentation.org/packages/randomForest/versions/4.6-12/topics/randomForest
https://docs.microsoft.com/en-us/machine-learning-server/r-reference/revoscaler/rxbtrees
https://www.rdocumentation.org/packages/gbm/versions/2.1.1/topics/gbm
https://docs.microsoft.com/en-us/machine-learning-server/r-reference/revoscaler/rxglm
https://www.rdocumentation.org/packages/stats/versions/3.4.3/topics/glm
https://www.rdocumentation.org/packages/nnet/versions/7.3-12/topics/nnet