Ben Heubl BenHeubl

## data2.csv
Keys,DeviceName,Uniqueness,Benefits,Website Link,DeviceDescription
246,3L Labs Footlogger,9,28,http://vandrico.com/node/246,"The 3L Labs Footlogger is a wearable fitness tracking device that aims at spotting health problems early, as well as logging daily activity. Placed in the user's shoe, 8 sensors coupled to 1 accelerometer help identify and record exercise habits. The data is then disclosed to the user via text or smartphone app. This computing device's technology can be used for athletes training, regular everyday workouts and rehabilitation."
387,4D Force,5,13,http://vandrico.com/node/387,"The 4D Force is a wearable technology that detects brain waves and converts them into electric signals. 4D Force developed a platform that can capture and compute high quality EEG/ EOG/EMG signals. With the device, users can control games by using the power of their thoughts. 4D Force can also be used for medical purposes as it has the ability to interpret electrical signals generated by the body, and create recommen

## gist:15b1f47417d7da85b7c90af989aa126a
# Names of the attributes whose entry in boruta_output_Boruta$finalDecision is
# "Confirmed" or "Tentative".
boruta_signif <- with(
  boruta_output_Boruta,
  names(finalDecision[finalDecision %in% c("Confirmed", "Tentative")])
)
print(boruta_signif)

# Plot the result object; las = 2 and cex.axis = 0.7 rotate and shrink the
# axis labels, and the x-axis title is left empty.
plot(
  boruta_output_Boruta,
  cex.axis = 0.7,
  las = 2,
  xlab = "",
  main = "Variable Importance"
)

## gist:0d639b93975506355e3563f93107c41b
# Classify every row of the test set and keep the prediction next to the data.
Brexit_test[["pred"]] <- predict(Brexit_model_final, Brexit_test, type = "class")

# Accuracy: fraction of test rows whose prediction equals Percent.Leave.
base_accuracy <- with(Brexit_test, mean(pred == Percent.Leave))

base_accuracy

## gist:b73bca173442fc6e1dd929961aca9fc3
# Fit a 500-tree random forest for INCOME on all other columns of TrainSet,
# with mtry = 3 and variable importance recorded.
model_random_forrest_optimal <- randomForest(
  INCOME ~ .,
  data = TrainSet,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# predict() for randomForest objects takes `newdata`, not `data`; the former
# `data = TrainSet` was silently swallowed by `...`. Without `newdata` the
# out-of-bag predictions stored in the model are returned, so the ignored
# argument is dropped here. The figures below are therefore OOB estimates for
# the training rows, not in-sample accuracy.
model_decision_RF <- predict(model_random_forrest_optimal)
table(model_decision_RF, TrainSet$INCOME)
mean(model_decision_RF == TrainSet$INCOME)

#[1] 0.7757143

## gist:f7ba8d25e2f04600f998ab5c65a560f5
# Install only the required packages that are missing, then attach them.
# (An unconditional install.packages() re-downloads on every run.)
for (pkg in c("rpart", "caret", "e1071")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rpart)
library(caret)
library(e1071)

# Fit a decision tree for INCOME on all other columns of TrainSet through
# caret's train() with the "rpart" method.
model_decision_tree <- train(INCOME ~ ., data = TrainSet, method = "rpart")

# Predict on the training rows. predict() for caret models takes `newdata`;
# the former `data = TrainSet` was not a recognised argument.
model_decision_tree_prediction <- predict(model_decision_tree, newdata = TrainSet)

## gist:7dc11574812c895e9e6cbddbfaf0f88b
# Use a for loop to find the best mtry for the model. (This took around
# 4 minutes for me, so get yourself a drink :-)

# Candidate mtry values. The result vector is preallocated and indexed by the
# position in this grid, so changing the grid cannot misalign the results.
mtry_grid <- 3:8
accuracy_list <- numeric(length(mtry_grid))
for (k in seq_along(mtry_grid)) {
  i <- mtry_grid[k]
  print(i)
  model_optimal <- randomForest(INCOME ~ ., data = TrainSet, ntree = 500, mtry = i, importance = TRUE)
  predValid <- predict(model_optimal, ValidSet, type = "class")
  # Validation accuracy for this mtry.
  accuracy_list[k] <- mean(predValid == ValidSet$INCOME)
}

## gist:d6966b766a23323bbf1f112cfd682c0c
income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

# Collapse INCOME into two classes: the five lowest brackets become
# "Under 30k", every other non-missing value becomes "Over 30k", and missing
# values stay missing. Missing values in factor columns are then relabelled
# "Unknown", and INCOME is converted to a factor.
# NOTE(review): the bracket labels below contain an en dash; confirm they match
# the raw CSV values exactly, otherwise those rows end up as "Over 30k".
incomeR <- income %>%
  mutate(
    INCOME = case_when(
      is.na(INCOME) ~ NA_character_,
      INCOME %in% c(
        "-10.000)",
        "[10.000–15.000)",
        "[15.000–20.000)",
        "[20.000–25.000)",
        "[25.000–30.000)"
      ) ~ "Under 30k",
      TRUE ~ "Over 30k"
    )
  ) %>%
  mutate_if(is.factor, fct_explicit_na, na_level = "Unknown") %>%
  mutate(INCOME = as.factor(INCOME))

## gist:de57feb16d8f5fd127df283242d0c1f8
# Statistical mode: return the most frequent value of `v`.
# On ties, the value that appears first in `v` wins.
getmode <- function(v) {
  candidates <- unique(v)
  # Count how often each distinct value occurs, in order of first appearance.
  counts <- tabulate(match(v, candidates))
  candidates[which.max(counts)]
}

# Most frequent OCCUPATION within each INCOME group (one row per group).
incomeR_mode_income <- summarise(
  group_by(incomeR, INCOME),
  mode = getmode(OCCUPATION)
)


## gist:ac732c97a074875e62d8b7d017d67dc8
# Collect the feature importance of model_base for plotting with ggplot.
Feature_importance <- importance(model_base)
# Index the importance matrix computed above; subsetting the `importance`
# function itself fails with "object of type 'closure' is not subsettable".
var_Importance <- data.frame(
  Variables = row.names(Feature_importance),
  Importance = round(Feature_importance[, "MeanDecreaseGini"], 2)
)

# Create ranks for the variables based on importance ("#1" = most important).
Rank_Importance <- var_Importance %>%
  mutate(Rank = paste0("#", dense_rank(desc(Importance))))

# Relative importance of our variables

## gist:1bd9cbf9383239559032d257e9f8fe28
# Load the income data set straight from the GitHub repository.
income <- read.csv(
  file = "https://raw.githubusercontent.com/selva86/datasets/master/income.csv"
)

# Fix the RNG seed so the shuffle below is reproducible.
set.seed(seed = 100)

# Shuffle the rows by indexing with a random permutation of the row numbers.
incomeR <- income[sample.int(nrow(income)), ]

# Inspect the column names of the shuffled data.
names(incomeR)
	Keys,DeviceName,Uniqueness,Benefits,Website Link,DeviceDescription
	246,3L Labs Footlogger,9,28,http://vandrico.com/node/246,"The 3L Labs Footlogger is a wearable fitness tracking device that aims at spotting health problems early, as well as logging daily activity. Placed in the user's shoe, 8 sensors coupled to 1 accelerometer help identify and record exercise habits. The data is then disclosed to the user via text or smartphone app. This computing device's technology can be used for athletes training, regular everyday workouts and rehabilitation."
	387,4D Force,5,13,http://vandrico.com/node/387,"The 4D Force is a wearable technology that detects brain waves and converts them into electric signals. 4D Force developed a platform that can capture and compute high quality EEG/ EOG/EMG signals. With the device, users can control games by using the power of their thoughts. 4D Force can also be used for medical purposes as it has the ability to interpret electrical signals generated by the body, and create recommen
	# Names of the attributes whose entry in boruta_output_Boruta$finalDecision is
	# "Confirmed" or "Tentative".
	boruta_signif <- with(
	  boruta_output_Boruta,
	  names(finalDecision[finalDecision %in% c("Confirmed", "Tentative")])
	)
	print(boruta_signif)

	# Plot the result object; las = 2 and cex.axis = 0.7 rotate and shrink the
	# axis labels, and the x-axis title is left empty.
	plot(
	  boruta_output_Boruta,
	  cex.axis = 0.7,
	  las = 2,
	  xlab = "",
	  main = "Variable Importance"
	)
	# Classify every row of the test set and keep the prediction next to the data.
	Brexit_test[["pred"]] <- predict(Brexit_model_final, Brexit_test, type = "class")

	# Accuracy: fraction of test rows whose prediction equals Percent.Leave.
	base_accuracy <- with(Brexit_test, mean(pred == Percent.Leave))

	base_accuracy
	# Fit a 500-tree random forest for INCOME on all other columns of TrainSet,
	# with mtry = 3 and variable importance recorded.
	model_random_forrest_optimal <- randomForest(
	  INCOME ~ .,
	  data = TrainSet,
	  ntree = 500,
	  mtry = 3,
	  importance = TRUE
	)

	# predict() for randomForest objects takes `newdata`, not `data`; the former
	# `data = TrainSet` was silently swallowed by `...`. Without `newdata` the
	# out-of-bag predictions stored in the model are returned, so the ignored
	# argument is dropped here. The figures below are therefore OOB estimates for
	# the training rows, not in-sample accuracy.
	model_decision_RF <- predict(model_random_forrest_optimal)
	table(model_decision_RF, TrainSet$INCOME)
	mean(model_decision_RF == TrainSet$INCOME)

	#[1] 0.7757143
	# Install only the required packages that are missing, then attach them.
	# (An unconditional install.packages() re-downloads on every run.)
	for (pkg in c("rpart", "caret", "e1071")) {
	  if (!requireNamespace(pkg, quietly = TRUE)) {
	    install.packages(pkg)
	  }
	}
	library(rpart)
	library(caret)
	library(e1071)

	# Fit a decision tree for INCOME on all other columns of TrainSet through
	# caret's train() with the "rpart" method.
	model_decision_tree <- train(INCOME ~ ., data = TrainSet, method = "rpart")

	# Predict on the training rows. predict() for caret models takes `newdata`;
	# the former `data = TrainSet` was not a recognised argument.
	model_decision_tree_prediction <- predict(model_decision_tree, newdata = TrainSet)
	# Use a for loop to find the best mtry for the model. (This took around
	# 4 minutes for me, so get yourself a drink :-)

	# Candidate mtry values. The result vector is preallocated and indexed by the
	# position in this grid, so changing the grid cannot misalign the results.
	mtry_grid <- 3:8
	accuracy_list <- numeric(length(mtry_grid))
	for (k in seq_along(mtry_grid)) {
	  i <- mtry_grid[k]
	  print(i)
	  model_optimal <- randomForest(INCOME ~ ., data = TrainSet, ntree = 500, mtry = i, importance = TRUE)
	  predValid <- predict(model_optimal, ValidSet, type = "class")
	  # Validation accuracy for this mtry.
	  accuracy_list[k] <- mean(predValid == ValidSet$INCOME)
	}
	income <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/income.csv")

	# Collapse INCOME into two classes: the five lowest brackets become
	# "Under 30k", every other non-missing value becomes "Over 30k", and missing
	# values stay missing. Missing values in factor columns are then relabelled
	# "Unknown", and INCOME is converted to a factor.
	# NOTE(review): the bracket labels below contain an en dash; confirm they match
	# the raw CSV values exactly, otherwise those rows end up as "Over 30k".
	incomeR <- income %>%
	  mutate(
	    INCOME = case_when(
	      is.na(INCOME) ~ NA_character_,
	      INCOME %in% c(
	        "-10.000)",
	        "[10.000–15.000)",
	        "[15.000–20.000)",
	        "[20.000–25.000)",
	        "[25.000–30.000)"
	      ) ~ "Under 30k",
	      TRUE ~ "Over 30k"
	    )
	  ) %>%
	  mutate_if(is.factor, fct_explicit_na, na_level = "Unknown") %>%
	  mutate(INCOME = as.factor(INCOME))
	# Statistical mode: return the most frequent value of `v`.
	# On ties, the value that appears first in `v` wins.
	getmode <- function(v) {
	  candidates <- unique(v)
	  # Count how often each distinct value occurs, in order of first appearance.
	  counts <- tabulate(match(v, candidates))
	  candidates[which.max(counts)]
	}

	# Most frequent OCCUPATION within each INCOME group (one row per group).
	incomeR_mode_income <- summarise(
	  group_by(incomeR, INCOME),
	  mode = getmode(OCCUPATION)
	)
	# Collect the feature importance of model_base for plotting with ggplot.
	Feature_importance <- importance(model_base)
	# Index the importance matrix computed above; subsetting the `importance`
	# function itself fails with "object of type 'closure' is not subsettable".
	var_Importance <- data.frame(
	  Variables = row.names(Feature_importance),
	  Importance = round(Feature_importance[, "MeanDecreaseGini"], 2)
	)

	# Create ranks for the variables based on importance ("#1" = most important).
	Rank_Importance <- var_Importance %>%
	  mutate(Rank = paste0("#", dense_rank(desc(Importance))))

	# Relative importance of our variables
	# Load the income data set straight from the GitHub repository.
	income <- read.csv(
	  file = "https://raw.githubusercontent.com/selva86/datasets/master/income.csv"
	)

	# Fix the RNG seed so the shuffle below is reproducible.
	set.seed(seed = 100)

	# Shuffle the rows by indexing with a random permutation of the row numbers.
	incomeR <- income[sample.int(nrow(income)), ]

	# Inspect the column names of the shuffled data.
	names(incomeR)