Skip to content

Instantly share code, notes, and snippets.

View karan19100's full-sized avatar

Karan Shah karan19100

View GitHub Profile
Describe customers
@karan19100
karan19100 / step4.R
Created July 18, 2020 21:13
step4_predicting_linear_model_by_karan_shah
# k-fold cross-validation of the HEMATOCRIT ~ HEMAGLOBIN linear model.
# `CVlm()` is expected to come from an attached package (presumably DAAG --
# confirm); `newdata2` must already exist in the workspace.
kfold <- CVlm(
  data       = newdata2,
  form.lm    = formula(HEMATOCRIT ~ HEMAGLOBIN),
  m          = 5,          # number of folds
  seed       = 123,        # fixed seed so the fold assignment is reproducible
  dots       = FALSE,
  plotit     = TRUE,
  printit    = FALSE,      # suppress the per-fold text report
  main       = "Cross Validation; k=5",
  legend.pos = "topleft"
)

# Show the "ms" attribute stored on the result (the cross-validated
# mean squared error): smaller values mean predictions sit closer to the
# observed points.
attr(kfold, "ms")
@karan19100
karan19100 / step3.R
Created July 18, 2020 21:04
step3_predicting_linear_model_by_karanshah
# Compare observed test-set HEMATOCRIT values with the model predictions.
# Reads `testData` and the prediction vector stored in the variable `predict`,
# both of which must already exist in the workspace.
act_pred <- data.frame(
  cbind(
    actuals    = testData$HEMATOCRIT,
    predicteds = predict
  )
)

# Correlation matrix between actual and predicted values.
cor(act_pred)

# First ten actual/predicted pairs for a quick visual check.
head(act_pred, n = 10)

# Min-max accuracy: for every row take the smaller of (actual, predicted)
# divided by the larger, then average over rows. A value of 1 means the
# predictions match the actuals exactly.
row_low <- pmin(act_pred$actuals, act_pred$predicteds)
row_high <- pmax(act_pred$actuals, act_pred$predicteds)
min_max <- mean(row_low / row_high)
print(min_max)

# Mean absolute percentage error: absolute deviation relative to the actual
# value, averaged over rows. The result is stored but not printed.
abs_dev <- abs(act_pred$predicteds - act_pred$actuals)
mape <- mean(abs_dev / act_pred$actuals)
@karan19100
karan19100 / step2.R
Created July 18, 2020 20:58
step2_predicting_linear_model_by_karanshah
# Fit the simple linear regression of HEMATOCRIT on HEMAGLOBIN using only the
# training rows, then report the fit.
modTrain <- lm(formula = HEMATOCRIT ~ HEMAGLOBIN, data = trainingData)
summary(modTrain)

# Predicted HEMATOCRIT for the held-out rows. The result is deliberately kept
# in a variable called `predict` (shadowing the function name) because other
# code reads it under that name; calls to predict() still resolve to the
# function.
predict <- predict(object = modTrain, newdata = testData)
@karan19100
karan19100 / step1.R
Created July 18, 2020 20:56
step1_for_predicting_linear_model_by_karanshah
# Reproducible 70/30 train/test split of `newdata2` (which must already exist).
set.seed(123) # fix the RNG so the random split can be reproduced

# Draw 70% of the row indices without replacement.
# seq_len() is used instead of 1:nrow() so that an empty data frame yields an
# empty index set rather than c(1, 0); floor() makes the truncation of the
# fractional sample size explicit.
trainingRowIndex <- sample(
  seq_len(nrow(newdata2)),
  floor(0.7 * nrow(newdata2))
)

trainingData <- newdata2[trainingRowIndex, ] # training data

# Test data = every row not drawn for training. setdiff() is used instead of
# negative indexing because `x[-integer(0), ]` selects zero rows, which would
# silently produce an empty test set whenever the training index is empty.
testData <- newdata2[
  setdiff(seq_len(nrow(newdata2)), trainingRowIndex),
]
@karan19100
karan19100 / Diagnostic_plots.R
Created July 18, 2020 20:41
Diagnostic_plots_by_karanshah
# Draw the diagnostic plots of the fitted model `mod2` on one 2 x 2 page.
# par() returns the settings it replaces, so they can be restored afterwards.
old_par <- par(mfrow = c(2, 2))
plot(mod2)
# Restore the previous layout so that later base-graphics plots are not
# squeezed into the 2 x 2 grid.
par(old_par)
@karan19100
karan19100 / new_model.R
Created July 18, 2020 20:33
new_model_by_karanshah
# Refit the regression with the centered predictor and report the fit.
# NOTE(review): no visible code adds HEMAGLOBIN_CENT as a column of
# `newdata2`, so lm() presumably resolves it from the calling environment
# rather than from `data` -- confirm this is intended.
mod2 <- lm(formula = HEMATOCRIT ~ HEMAGLOBIN_CENT, data = newdata2)
summary(mod2)
@karan19100
karan19100 / new_dataset.R
Created July 18, 2020 20:32
new_dataset_by_karanshah
# Build the modelling data set: drop observations 159, 166 and 169 from
# `newdata1` and keep only the two variables used by the regression.
newdata2 <- subset(
  newdata1,
  subset = !(OBS %in% c(159, 166, 169)),
  select = c(HEMAGLOBIN, HEMATOCRIT)
)

# Centered copy of the predictor (mean subtracted, not rescaled). scale()
# returns a one-column matrix; it is stored as a standalone object, not as a
# column of `newdata2`.
HEMAGLOBIN_CENT <- scale(newdata2$HEMAGLOBIN, center = TRUE, scale = FALSE)
@karan19100
karan19100 / model_improvement.R
Created July 18, 2020 20:29
model_improvement_by_karanshah
# Flag suspected outliers and plot the two variables against each other.
#
# Copy `newdata` with its row names promoted to an explicit OBS column.
# The three names supplied below assume `newdata` has exactly two columns, in
# the order HEMAGLOBIN, HEMATOCRIT -- TODO confirm against where `newdata` is
# built.
newdata1 <- setNames(
  cbind(rownames(newdata), newdata, row.names = NULL),
  c("OBS", "HEMAGLOBIN", "HEMATOCRIT")
)

# "Y" for the three observations treated as outliers, "N" otherwise.
newdata1$OUTLIER <- ifelse(newdata1$OBS %in% c(159, 166, 169), "Y", "N")

# Scatter plot coloured by outlier flag, with the predictor (HEMAGLOBIN) on x
# and the response (HEMATOCRIT) on y, matching the model
# HEMATOCRIT ~ HEMAGLOBIN. The axis breaks are unchanged: 10:25 on x and
# 30:65 on y. Previously the variables were mapped the other way round, so
# each axis was given the other variable's breaks (presumably 10-25 is the
# hemoglobin range and 30-65 the hematocrit range -- confirm against data).
# ggplot() + geom_point() replaces the deprecated qplot() shortcut.
ggplot(newdata1, aes(x = HEMAGLOBIN, y = HEMATOCRIT, colour = OUTLIER)) +
  geom_point() +
  ggtitle("HEMAGLOBIN and HEMATOCRIT relationship") +
  theme(plot.title = element_text(hjust = 0.5)) + # center the title
  scale_y_continuous(breaks = 30:65, minor_breaks = NULL) +
  scale_x_continuous(breaks = 10:25, minor_breaks = NULL)
@karan19100
karan19100 / f_statistic.R
Created July 18, 2020 20:20
f_statistic_by_karanshah
# Extract the overall F statistic of the fitted model `mod1` (defined outside
# this snippet; expected to be an lm fit).
#
# The F statistic is a component of the model *summary*, not of the lm object
# itself: `mod1$fstatistic` is NULL, so it must be read from summary(mod1).
# `f` holds the statistic together with its numerator and denominator degrees
# of freedom, i.e. the inputs needed for the model p-value.
f <- summary(mod1)$fstatistic
f_statistic <- f[1] # the F value itself
print(f) # print F value and its degrees of freedom