 Describe customers
Created Jul 18, 2020
step4_predicting_linear_model_by_karan_shah
 # K-fold cross-validation (k = 5) of the HEMATOCRIT ~ HEMAGLOBIN model.
 # NOTE(review): CVlm() is presumably DAAG::CVlm -- confirm that the package
 # is attached earlier in the script.
 kfold <- CVlm(
   data = newdata2,
   form.lm = formula(HEMATOCRIT ~ HEMAGLOBIN),
   m = 5,
   dots = FALSE,
   seed = 123,
   legend.pos = "topleft",
   main = "Cross Validation; k=5",
   plotit = TRUE,
   printit = FALSE
 )

 # The mean squared error measures how close the regression line is to the
 # set of points; it is read from the "ms" attribute of the result.
 attr(kfold, "ms")
Created Jul 18, 2020
step3_predicting_linear_model_by_karanshah
 # Pair each observed test-set value with the corresponding prediction.
 # `predict` is the vector of predicted values produced by the model-fitting
 # step (that step reuses the function's name for its result).
 act_pred <- data.frame(
   cbind(actuals = testData$HEMATOCRIT, predicteds = predict)
 )

 # Correlation between actual and predicted values.
 cor(act_pred)

 # First ten actual/predicted pairs for a visual comparison.
 head(act_pred, n = 10)

 # Min-max accuracy: for every row divide the smaller of (actual, predicted)
 # by the larger one, then average the ratios. pmin()/pmax() do this
 # element-wise without coercing the data frame to a matrix.
 min_max <- mean(
   pmin(act_pred$actuals, act_pred$predicteds) /
     pmax(act_pred$actuals, act_pred$predicteds)
 )
 print(min_max)

 # Mean absolute percentage error, kept as a proportion (not multiplied
 # by 100).
 mape <- mean(abs(act_pred$predicteds - act_pred$actuals) / act_pred$actuals)
Created Jul 18, 2020
step2_predicting_linear_model_by_karanshah
 # Fit the simple linear regression on the training data.
 modTrain <- lm(HEMATOCRIT ~ HEMAGLOBIN, data = trainingData)

 # Predicted values for the held-out test data. The result is stored under
 # the name `predict` because the accuracy step reads it by that name; R
 # still finds the predict() function when the name is used in a call.
 predict <- predict(modTrain, testData)

 # Coefficients, residual summary and fit statistics of the trained model.
 summary(modTrain)
Created Jul 18, 2020
step1_for_predicting_linear_model_by_karanshah
 # Fix the seed so the random train/test split is reproducible.
 set.seed(123)

 # Draw row indices for the training set: 70% of the rows of newdata2.
 # seq_len() is used instead of 1:nrow() so an empty data frame does not
 # produce the sequence c(1, 0).
 trainingRowIndex <- sample(seq_len(nrow(newdata2)), 0.7 * nrow(newdata2))

 # Training data: the sampled rows. Test data: every remaining row.
 trainingData <- newdata2[trainingRowIndex, ]
 testData <- newdata2[-trainingRowIndex, ]
Created Jul 18, 2020
Diagnostic_plots_by_karanshah
 # Use a 2 x 2 plotting grid so the plots drawn for mod2 share one page.
 par(mfrow = c(2, 2))

 # Draw the model's diagnostic plots.
 plot(mod2)
Created Jul 18, 2020
new_model_by_karanshah
 # Refit the regression using the centred predictor. HEMAGLOBIN_CENT is a
 # standalone object rather than a column of newdata2, so lm() picks it up
 # from the environment in which the formula is evaluated.
 mod2 <- lm(HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata2)

 # Coefficients, residual summary and fit statistics of the new model.
 summary(mod2)
Created Jul 18, 2020
new_dataset_by_karanshah
 # Drop the three observations treated as outliers (OBS 159, 166 and 169)
 # and keep only the two model variables.
 newdata2 <- subset(
   newdata1,
   !(OBS %in% c(159, 166, 169)),
   select = c(HEMAGLOBIN, HEMATOCRIT)
 )

 # Centre the predictor on its mean without rescaling it. The result is kept
 # as a standalone object because the model formula refers to it by name.
 HEMAGLOBIN_CENT <- scale(newdata2$HEMAGLOBIN, center = TRUE, scale = FALSE)
Created Jul 18, 2020
model_improvement_by_karanshah
 # Turn the row names of newdata into an explicit OBS column and give the
 # three resulting columns fixed names.
 newdata1 <- setNames(
   cbind(rownames(newdata), newdata, row.names = NULL),
   c("OBS", "HEMAGLOBIN", "HEMATOCRIT")
 )

 # Flag observations 159, 166 and 169 as outliers ("Y"); all others get "N".
 newdata1$OUTLIER <- ifelse(newdata1$OBS %in% c(159, 166, 169), "Y", "N")

 # Scatter plot coloured by the outlier flag, with a centred title.
 # ggplot() + geom_point() replaces qplot(), which ggplot2 has deprecated.
 # NOTE(review): HEMATOCRIT is on the x axis and HEMAGLOBIN on the y axis,
 # yet the y breaks are 30:65 and the x breaks 10:25 -- the break ranges look
 # swapped relative to the axes; confirm against the data before changing.
 ggplot(newdata1, aes(x = HEMATOCRIT, y = HEMAGLOBIN, colour = OUTLIER)) +
   geom_point() +
   ggtitle("HEMAGLOBIN and HEMATOCRIT relationship") +
   theme(plot.title = element_text(hjust = 0.5)) +
   scale_y_continuous(breaks = 30:65, minor_breaks = NULL) +
   scale_x_continuous(breaks = 10:25, minor_breaks = NULL)
Created Jul 18, 2020
f_statistic_by_karanshah
 # F statistic and its degrees of freedom, as reported by the model summary.
 # These are the inputs needed to compute the model's p-value.
 f <- summary(mod1)$fstatistic

 # The F value itself is the first element. It has to come from the summary:
 # the fitted model object carries no `fstatistic` component, so reading
 # mod1$fstatistic directly would give NULL.
 f_statistic <- f[1]

 # Show the F value together with its degrees of freedom.
 print(f)