Skip to content

Instantly share code, notes, and snippets.

View BenHeubl's full-sized avatar

Ben Heubl BenHeubl

  • London
View GitHub Profile
# Fit the tuned random forest classifier for INCOME on all other columns of
# TrainSet (500 trees, 3 candidate variables per split, importance recorded).
model_random_forrest_optimal <- randomForest(
  INCOME ~ .,
  data = TrainSet,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)
# predict.randomForest() has no `data` argument; a `data = TrainSet` argument
# is silently ignored. Calling predict() without `newdata` returns the
# out-of-bag (OOB) predictions for the training rows, so the accuracy below is
# an OOB estimate, not a resubstitution (training-set) accuracy.
model_decision_RF <- predict(model_random_forrest_optimal)
# Confusion table of OOB predictions against the observed classes.
table(model_decision_RF, TrainSet$INCOME)
# Share of training rows whose OOB prediction matches the observed class.
mean(model_decision_RF == TrainSet$INCOME)
#[1] 0.7757143
# Install the required packages only when they are missing, so re-running the
# script does not reinstall them every time.
for (pkg in c("rpart", "caret", "e1071")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(caret)
library(e1071)
# Fit a decision tree for INCOME on all other columns of TrainSet through
# caret's train() interface, using the "rpart" method.
model_decision_tree <- train(INCOME ~ ., data = TrainSet, method = "rpart")
# predict() takes the data to score via `newdata`; `data` is not an argument
# of the predict method and would be silently ignored.
model_decision_tree_prediction <- predict(model_decision_tree, newdata = TrainSet)
# Using a for loop to identify the right mtry for the model (this took around
# 4 minutes for me. Get yourself a drink :-)
# Candidate mtry values to try; accuracy_list[k] holds the validation accuracy
# obtained with mtry_candidates[k].
mtry_candidates <- 3:8
# Preallocate the result vector instead of growing it inside the loop.
accuracy_list <- numeric(length(mtry_candidates))
for (k in seq_along(mtry_candidates)) {
  i <- mtry_candidates[k]
  # Progress output: the mtry value currently being fitted.
  print(i)
  model_optimal <- randomForest(
    INCOME ~ .,
    data = TrainSet,
    ntree = 500,
    mtry = i,
    importance = TRUE
  )
  # Score the held-out validation set and record the share of correct classes.
  predValid <- predict(model_optimal, ValidSet, type = "class")
  accuracy_list[k] <- mean(predValid == ValidSet$INCOME)
}
# Load the income data set from the GitHub repository.
income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")
# Collapse the INCOME brackets into two classes: the five listed brackets
# become "Under 30k", every other non-missing value becomes "Over 30k", and
# missing values stay missing. Afterwards, missing values in factor columns
# are turned into an explicit "Unknown" level and INCOME is stored as a factor.
incomeR <- income %>%
  mutate(
    INCOME = case_when(
      is.na(INCOME) ~ NA_character_,
      INCOME %in% c(
        "-10.000)",
        "[10.000–15.000)",
        "[15.000–20.000)",
        "[20.000–25.000)",
        "[25.000–30.000)"
      ) ~ "Under 30k",
      TRUE ~ "Over 30k"
    )
  ) %>%
  mutate_if(is.factor, fct_explicit_na, na_level = "Unknown") %>%
  mutate(INCOME = as.factor(INCOME))
#' Most frequent value (statistical mode) of a vector
#'
#' @param v A vector.
#' @return The value of `v` that occurs most often; when several values tie,
#'   the one that appears first in `v` is returned.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Map every element to the position of its value among the distinct values,
  # then count how often each position occurs.
  positions <- match(v, distinct_values)
  counts <- tabulate(positions)
  # which.max() picks the first maximum, i.e. the earliest-seen value on ties.
  distinct_values[which.max(counts)]
}
# For each INCOME class, report the most frequent OCCUPATION value
# (computed with getmode()) in a column named `mode`.
incomeR_mode_income <- summarise(
  group_by(incomeR, INCOME),
  mode = getmode(OCCUPATION)
)
# Plotting our feature importance with ggplot:
# Extract the importance measures from the fitted model.
# NOTE(review): `model_base` is not defined in this snippet; presumably a
# randomForest model fitted earlier - confirm.
Feature_importance <- importance(model_base)
var_Importance <- data.frame(
  Variables = row.names(Feature_importance),
  # Index the extracted importance matrix, not the importance() function
  # itself (subsetting the function fails at run time).
  Importance = round(Feature_importance[, "MeanDecreaseGini"], 2)
)
# Create ranks for the variables based on importance (rank #1 = highest)
Rank_Importance <- var_Importance %>%
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))
# Relative importance of our variables
# Load the income data set from the GitHub repository.
income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")
# Fix the random seed so the shuffle below is reproducible.
set.seed(100)
# Shuffle the rows by indexing with a random permutation of the row numbers.
incomeR <- income[sample.int(nrow(income)), ]
# Print the column names of the shuffled data (see above screenshot).
names(incomeR)
# Install relaimpo only when it is missing, then attach it.
if (!requireNamespace("relaimpo", quietly = TRUE)) {
  install.packages("relaimpo")
}
library(relaimpo)
# Fit linear model: ozone_reading regressed on every other column of Ozone.
# NOTE(review): `Ozone` must already be loaded before this point - confirm the
# data-reading step runs first.
Ozone_model <- lm(ozone_reading ~ ., data = Ozone)
# Get relative importance from the model fitted above (the "lmg" metric).
# The fitted object is `Ozone_model`; no `lmMod` object is defined here.
Relative_importance <- calc.relimp(Ozone_model, type = "lmg", rela = TRUE)
# Relative importance scaled to 100 and plot:
# Install PerformanceAnalytics only when it is missing, then attach it.
if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
  install.packages("PerformanceAnalytics")
}
library(PerformanceAnalytics)
# Read in the data first: `Ozone` has to exist before it can be charted.
Ozone <- read.csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/ozone.csv",
  stringsAsFactors = FALSE
)
# Correlation chart of the Ozone columns, with histograms enabled and
# solid-circle point markers (pch = 19).
chart.Correlation(Ozone, histogram = TRUE, pch = 19)