# Ben Heubl BenHeubl

## gist:b73bca173442fc6e1dd929961aca9fc3
# Fit a random forest for INCOME on every other column of TrainSet
# (500 trees, 3 candidate variables per split), keeping importance measures.
model_random_forrest_optimal <- randomForest(
  INCOME ~ .,
  data = TrainSet,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# predict.randomForest() takes `newdata`, not `data`. The former
# `data = TrainSet` was swallowed by `...` and ignored, so this call always
# returned the out-of-bag (OOB) predictions. Calling predict() without new
# data makes that explicit and leaves the reported accuracy unchanged.
model_decision_RF <- predict(model_random_forrest_optimal)

# Confusion matrix and accuracy of the OOB predictions for the training rows.
table(model_decision_RF, TrainSet$INCOME)
mean(model_decision_RF == TrainSet$INCOME)

#[1] 0.7757143

## gist:f7ba8d25e2f04600f998ab5c65a560f5
# Install the modelling packages only when they are missing, so re-running
# the script does not reinstall them every time, then attach them.
for (pkg in c("rpart", "caret", "e1071")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(caret)
library(e1071)

# Fit a decision tree for INCOME through caret's train() with method "rpart".
model_decision_tree <- train(INCOME ~ ., data = TrainSet, method = "rpart")

# predict() for a caret `train` object takes `newdata`; `data` is not one of
# its arguments. Name the data explicitly so it is clear what is being scored.
model_decision_tree_prediction <- predict(
  model_decision_tree,
  newdata = TrainSet
)

## gist:7dc11574812c895e9e6cbddbfaf0f88b
# Use a for loop to identify the right mtry for the model (this took around
# 4 minutes for me. Get yourself a drink :-)

# Candidate mtry values. accuracy_list[k] holds the validation accuracy
# obtained with mtry_values[k]; the vector is preallocated instead of being
# grown inside the loop, and its index no longer depends on a fixed offset.
mtry_values <- 3:8
accuracy_list <- numeric(length(mtry_values))
for (k in seq_along(mtry_values)) {
  i <- mtry_values[k]
  print(i)
  model_optimal <- randomForest(
    INCOME ~ .,
    data = TrainSet,
    ntree = 500,
    mtry = i,
    importance = TRUE
  )
  # Score the held-out validation set and record the share of correct labels.
  predValid <- predict(model_optimal, ValidSet, type = "class")
  accuracy_list[k] <- mean(predValid == ValidSet$INCOME)
}

## gist:d6966b766a23323bbf1f112cfd682c0c
# Load the raw income data.
income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

# Collapse the INCOME brackets into two classes, turn missing factor values
# into an explicit "Unknown" level, and store INCOME as a factor.
# NOTE(review): the bracket labels below contain an en dash; confirm they
# match the labels in the CSV exactly, otherwise those rows fall through to
# "Over 30k".
incomeR <- income %>%
  mutate(
    INCOME = case_when(
      # Keep missing INCOME values missing.
      is.na(INCOME) ~ NA_character_,
      INCOME %in% c(
        "-10.000)",
        "[10.000–15.000)",
        "[15.000–20.000)",
        "[20.000–25.000)",
        "[25.000–30.000)"
      ) ~ "Under 30k",
      TRUE ~ "Over 30k"
    )
  ) %>%
  mutate_if(is.factor, fct_explicit_na, na_level = "Unknown") %>%
  mutate(INCOME = as.factor(INCOME))

## gist:de57feb16d8f5fd127df283242d0c1f8
# Mode function: returns the most frequent value of `v`.
# When several values are tied, the one that appears first in `v` wins.
getmode <- function(v) {
  candidates <- unique(v)
  # Count how often each distinct value occurs, in order of first appearance.
  frequency <- tabulate(match(v, candidates))
  candidates[which.max(frequency)]
}

# Most frequent OCCUPATION within each INCOME group.
incomeR_mode_income <- summarise(
  group_by(incomeR, INCOME),
  mode = getmode(OCCUPATION)
)


## gist:ac732c97a074875e62d8b7d017d67dc8
# ggplotting our feature importance:
Feature_importance <- importance(model_base)

# Build a two-column table of variable names and rounded Gini importance.
# Index the matrix computed above: the previous code subset the `importance`
# function itself, which fails because a function cannot be subset.
var_Importance <- data.frame(
  Variables = row.names(Feature_importance),
  Importance = round(Feature_importance[, "MeanDecreaseGini"], 2)
)

# Create ranks for the variables based on importance ("#1" = most important)
Rank_Importance <- var_Importance %>%
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))

# Relative importance of our variables

## gist:1bd9cbf9383239559032d257e9f8fe28
# Read in the data from the GitHub repo:
income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(100)

# Shuffle row-wise by indexing with a random permutation of the row numbers:
shuffled_rows <- sample(nrow(income))
incomeR <- income[shuffled_rows, ]

# Check the column names (see above screenshot)
colnames(incomeR)

## gist:652fba89132ee4b4a94e3c2ab2d46867
# Install relaimpo only when it is missing, then attach it.
if (!requireNamespace("relaimpo", quietly = TRUE)) {
  install.packages("relaimpo")
}
library(relaimpo)

# Fit linear model: ozone_reading against every other column of Ozone.
Ozone_model <- lm(ozone_reading ~ ., data = Ozone)

# Get relative importance:
# Pass the model fitted above; the previous code referenced `lmMod`, which is
# not defined anywhere in the visible script.
Relative_importance <- calc.relimp(Ozone_model, type = "lmg", rela = TRUE)

# Relative importance scaled to 100 and plot:

## gist:71e6b33710ffed3f19707dfdf5b80cc8
# Install PerformanceAnalytics only when it is missing, so re-running the
# script does not reinstall it every time, then attach it.
if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
  install.packages("PerformanceAnalytics")
}
library(PerformanceAnalytics)

# Correlation chart of the Ozone columns, with histograms enabled.
chart.Correlation(Ozone, histogram = TRUE, pch = 19)

## gist:019d4c6304ba8ea423b9188191fed11c
# Read in the ozone data, keeping strings as character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
Ozone <- read.csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/ozone.csv",
  stringsAsFactors = FALSE
)
	# Fit a random forest for INCOME on every other column of TrainSet
	# (500 trees, 3 candidate variables per split), keeping importance measures.
	model_random_forrest_optimal <- randomForest(
	  INCOME ~ .,
	  data = TrainSet,
	  ntree = 500,
	  mtry = 3,
	  importance = TRUE
	)

	# predict.randomForest() takes `newdata`, not `data`. The former
	# `data = TrainSet` was swallowed by `...` and ignored, so this call always
	# returned the out-of-bag (OOB) predictions. Calling predict() without new
	# data makes that explicit and leaves the reported accuracy unchanged.
	model_decision_RF <- predict(model_random_forrest_optimal)

	# Confusion matrix and accuracy of the OOB predictions for the training rows.
	table(model_decision_RF, TrainSet$INCOME)
	mean(model_decision_RF == TrainSet$INCOME)

	#[1] 0.7757143
	# Install the modelling packages only when they are missing, so re-running
	# the script does not reinstall them every time, then attach them.
	for (pkg in c("rpart", "caret", "e1071")) {
	  if (!requireNamespace(pkg, quietly = TRUE)) {
	    install.packages(pkg)
	  }
	}
	library(rpart)
	library(caret)
	library(e1071)

	# Fit a decision tree for INCOME through caret's train() with method "rpart".
	model_decision_tree <- train(INCOME ~ ., data = TrainSet, method = "rpart")

	# predict() for a caret `train` object takes `newdata`; `data` is not one of
	# its arguments. Name the data explicitly so it is clear what is being scored.
	model_decision_tree_prediction <- predict(
	  model_decision_tree,
	  newdata = TrainSet
	)
	# Use a for loop to identify the right mtry for the model (this took around
	# 4 minutes for me. Get yourself a drink :-)

	# Candidate mtry values. accuracy_list[k] holds the validation accuracy
	# obtained with mtry_values[k]; the vector is preallocated instead of being
	# grown inside the loop, and its index no longer depends on a fixed offset.
	mtry_values <- 3:8
	accuracy_list <- numeric(length(mtry_values))
	for (k in seq_along(mtry_values)) {
	  i <- mtry_values[k]
	  print(i)
	  model_optimal <- randomForest(
	    INCOME ~ .,
	    data = TrainSet,
	    ntree = 500,
	    mtry = i,
	    importance = TRUE
	  )
	  # Score the held-out validation set and record the share of correct labels.
	  predValid <- predict(model_optimal, ValidSet, type = "class")
	  accuracy_list[k] <- mean(predValid == ValidSet$INCOME)
	}
	# Load the raw income data.
	income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

	# Collapse the INCOME brackets into two classes, turn missing factor values
	# into an explicit "Unknown" level, and store INCOME as a factor.
	# NOTE(review): the bracket labels below contain an en dash; confirm they
	# match the labels in the CSV exactly, otherwise those rows fall through to
	# "Over 30k".
	incomeR <- income %>%
	  mutate(
	    INCOME = case_when(
	      # Keep missing INCOME values missing.
	      is.na(INCOME) ~ NA_character_,
	      INCOME %in% c(
	        "-10.000)",
	        "[10.000–15.000)",
	        "[15.000–20.000)",
	        "[20.000–25.000)",
	        "[25.000–30.000)"
	      ) ~ "Under 30k",
	      TRUE ~ "Over 30k"
	    )
	  ) %>%
	  mutate_if(is.factor, fct_explicit_na, na_level = "Unknown") %>%
	  mutate(INCOME = as.factor(INCOME))
	# Mode function: returns the most frequent value of `v`.
	# When several values are tied, the one that appears first in `v` wins.
	getmode <- function(v) {
	  candidates <- unique(v)
	  # Count how often each distinct value occurs, in order of first appearance.
	  frequency <- tabulate(match(v, candidates))
	  candidates[which.max(frequency)]
	}

	# Most frequent OCCUPATION within each INCOME group.
	incomeR_mode_income <- summarise(
	  group_by(incomeR, INCOME),
	  mode = getmode(OCCUPATION)
	)
	# ggplotting our feature importance:
	Feature_importance <- importance(model_base)

	# Build a two-column table of variable names and rounded Gini importance.
	# Index the matrix computed above: the previous code subset the `importance`
	# function itself, which fails because a function cannot be subset.
	var_Importance <- data.frame(
	  Variables = row.names(Feature_importance),
	  Importance = round(Feature_importance[, "MeanDecreaseGini"], 2)
	)

	# Create ranks for the variables based on importance ("#1" = most important)
	Rank_Importance <- var_Importance %>%
	  mutate(Rank = paste0("#", dense_rank(desc(Importance))))

	# Relative importance of our variables
	# Read in the data from the GitHub repo:
	income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

	# Fix the RNG seed so the shuffle below is reproducible.
	set.seed(100)

	# Shuffle row-wise by indexing with a random permutation of the row numbers:
	shuffled_rows <- sample(nrow(income))
	incomeR <- income[shuffled_rows, ]

	# Check the column names (see above screenshot)
	colnames(incomeR)
	# Install relaimpo only when it is missing, then attach it.
	if (!requireNamespace("relaimpo", quietly = TRUE)) {
	  install.packages("relaimpo")
	}
	library(relaimpo)

	# Fit linear model: ozone_reading against every other column of Ozone.
	Ozone_model <- lm(ozone_reading ~ ., data = Ozone)

	# Get relative importance:
	# Pass the model fitted above; the previous code referenced `lmMod`, which is
	# not defined anywhere in the visible script.
	Relative_importance <- calc.relimp(Ozone_model, type = "lmg", rela = TRUE)

	# Relative importance scaled to 100 and plot:
	# Install PerformanceAnalytics only when it is missing, so re-running the
	# script does not reinstall it every time, then attach it.
	if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
	  install.packages("PerformanceAnalytics")
	}
	library(PerformanceAnalytics)

	# Correlation chart of the Ozone columns, with histograms enabled.
	chart.Correlation(Ozone, histogram = TRUE, pch = 19)
	# Read in the ozone data, keeping strings as character vectors.
	# `FALSE` is spelled out because `F` is an ordinary variable that can be
	# reassigned.
	Ozone <- read.csv(
	  "https://raw.githubusercontent.com/selva86/datasets/master/ozone.csv",
	  stringsAsFactors = FALSE
	)