daroczig/BCE-MDDA-2018.R

## BCE-MDDA-2018.R
## #############################################################################
## PCA demo on image processing
## #############################################################################

download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw«

library(jpeg)
img <- readJPEG('image.jpg')
str(img)

dim(img)
h <- dim(img)[1]
w <- dim(img)[2]

img1d <- matrix(img, h * w)
pca <- prcomp(img1d)
pca
summary(pca)

image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))
image(matrix(pca$x[, 2], h), col = gray.colors(100))

image(matrix(pca$x[, 3], h))

## #############################################################################
## MDS
## #############################################################################

download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw«

library(readxl)
cities <- read_excel('cities .xls')

## get rid of 1st column and last row (metadata)
cities <- cities[, -1]
cities <- cities[-nrow(cities), ]

mds <- cmdscale(as.dist(cities))

plot(mds)
text(mds[, 1], mds[, 2], tail(names(cities), -1))

mds <- -mds
plot(mds)
text(mds[, 1], mds[, 2], tail(names(cities), -1))

mds <- cmdscale(dist(mtcars))
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))

mds <- as.data.frame(mds)
library(ggplot2) # start with geom_point
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text()

library(ggrepel)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel()

## #############################################################################
## feature selection
## #############################################################################

str(iris)
plot(iris$Sepal.Length, iris$Sepal.Width)

fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit
summary(fit)

plot(iris$Sepal.Length, iris$Sepal.Width)
abline(fit, col = 'red', lwd = 5)

## now try
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))

ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) +
  geom_point() + geom_smooth(method = 'lm')

ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(aes(col = Species)) +
  geom_smooth(aes(col = Species), method = 'lm') +
  geom_smooth(method = 'lm', col = 'black') + theme_bw()

## #############################################################################
## PCA demo to show the steps in hierarchical clustering
## #############################################################################

prcomp(iris[, 1:4])
PC <- prcomp(iris[, 1:4])$x
dm <- dist(iris[, 1:4])
hc <- hclust(dm)
plot(hc)

for (i in 2:8) {
  plot(hc)
  rect.hclust(hc, k = i, border = 'red')
  Sys.sleep(1)
}

library(animation)
ani.options(interval = 1)
saveGIF({
  for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i, border = 'red')
  }
})

library(animation)
ani.options(interval = 1)
saveGIF({
  for (i in 1:8) {
    plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5)
  }
})

## #############################################################################
## basic decision tress
## #############################################################################

rm(iris)

set.seed(100)
mysample <- sample(1:150, 100)
str(mysample)

train <- iris[mysample, ]
test  <- iris[setdiff(1:150, mysample), ]

library(rpart)
ct <- rpart(Species ~ ., data = train)
summary(ct)

plot(ct); text(ct)

library(partykit)
plot(as.party(ct))

predict(ct, newdata = test)
predict(ct, newdata = test, type = 'class')

## confusion matrix
table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))

ct <- rpart(Species ~ ., data = train,
            minsplit = 1, cp = 0.001)
## ?rpart.control
plot(ct); text(ct)
plot(as.party(ct))

table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))

## back to slides on high level overview on bagging, random forest, gbm etc

## #############################################################################
## H2O
## #############################################################################

## start and connect to H2O
library(h2o)
h2o.init()
## in a browser check http://localhost:54321

rpart()
h2o.randomForest()

## demo data
library(hflights)
rm(hflights)
str(hflights)

## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')

str(hflights.hex)
str(head(hflights.hex))
head(hflights.hex, 3)
summary(hflights.hex)
## check flight number: numeric?
## check on H2O web interface as well (enum vs factor)

## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)

## boring
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
    hflights.hex[, v] <- as.factor(hflights.hex[, v])
}

summary(hflights.hex)

## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]
hflights[, .N, by = hour]

hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
hflights[, .N, by = hour][order(hour)]
hflights[hour == '']

hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
hflights[, .N, by = hour]

hflights[is.na(hour)]
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

## drop columns
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
                         UniqueCarrier, FlightNum, TailNum, Distance,
                         Cancelled = factor(Cancelled))]


## re-upload to H2O
h2o.ls()
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

## split the data
h2o.splitFrame(data = hflights.hex , ratios = 0.75,
               destination_frames = paste0('hflights', 0:1))
h2o.ls()

## build the first model
hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1')
hflights.rf

## more trees
hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf500',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1', ntrees = 500)

## change to UI and check ROC curve & AUC

## GBM
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm')

## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm2', ntrees = 550)

## but no: although higher training AUC, lower validation AUC => overfit

## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights_part0',
    validation_frame = 'hflights_part1',
    model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)

## bye
h2o.shutdown()
	## #############################################################################
	## PCA demo on image processing
	## #############################################################################

	download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw«

	library(jpeg)
	img <- readJPEG('image.jpg')
	str(img)

	dim(img)
	h <- dim(img)[1]
	w <- dim(img)[2]

	img1d <- matrix(img, h * w)
	pca <- prcomp(img1d)
	pca
	summary(pca)

	image(matrix(pca$x[, 1], h))
	image(matrix(pca$x[, 2], h))
	image(matrix(pca$x[, 2], h), col = gray.colors(100))

	image(matrix(pca$x[, 3], h))

	## #############################################################################
	## MDS
	## #############################################################################

	download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw«

	library(readxl)
	cities <- read_excel('cities .xls')

	## get rid of 1st column and last row (metadata)
	cities <- cities[, -1]
	cities <- cities[-nrow(cities), ]

	mds <- cmdscale(as.dist(cities))

	plot(mds)
	text(mds[, 1], mds[, 2], tail(names(cities), -1))

	mds <- -mds
	plot(mds)
	text(mds[, 1], mds[, 2], tail(names(cities), -1))

	mds <- cmdscale(dist(mtcars))
	plot(mds)
	text(mds[, 1], mds[, 2], rownames(mtcars))

	mds <- as.data.frame(mds)
	library(ggplot2) # start with geom_point
	ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text()

	library(ggrepel)
	ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel()

	## #############################################################################
	## feature selection
	## #############################################################################

	str(iris)
	plot(iris$Sepal.Length, iris$Sepal.Width)

	fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
	fit
	summary(fit)

	plot(iris$Sepal.Length, iris$Sepal.Width)
	abline(fit, col = 'red', lwd = 5)

	## now try
	plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
	summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))

	ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) +
	geom_point() + geom_smooth(method = 'lm')

	ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
	geom_point(aes(col = Species)) +
	geom_smooth(aes(col = Species), method = 'lm') +
	geom_smooth(method = 'lm', col = 'black') + theme_bw()

	## #############################################################################
	## PCA demo to show the steps in hierarchical clustering
	## #############################################################################

	prcomp(iris[, 1:4])
	PC <- prcomp(iris[, 1:4])$x
	dm <- dist(iris[, 1:4])
	hc <- hclust(dm)
	plot(hc)

	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i, border = 'red')
	Sys.sleep(1)
	}

	library(animation)
	ani.options(interval = 1)
	saveGIF({
	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i, border = 'red')
	}
	})

	library(animation)
	ani.options(interval = 1)
	saveGIF({
	for (i in 1:8) {
	plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5)
	}
	})

	## #############################################################################
	## basic decision tress
	## #############################################################################

	rm(iris)

	set.seed(100)
	mysample <- sample(1:150, 100)
	str(mysample)

	train <- iris[mysample, ]
	test <- iris[setdiff(1:150, mysample), ]

	library(rpart)
	ct <- rpart(Species ~ ., data = train)
	summary(ct)

	plot(ct); text(ct)

	library(partykit)
	plot(as.party(ct))

	predict(ct, newdata = test)
	predict(ct, newdata = test, type = 'class')

	## confusion matrix
	table(test$Species,
	predict(ct, newdata = test,
	type = 'class'))

	ct <- rpart(Species ~ ., data = train,
	minsplit = 1, cp = 0.001)
	## ?rpart.control
	plot(ct); text(ct)
	plot(as.party(ct))

	table(test$Species,
	predict(ct, newdata = test,
	type = 'class'))

	## back to slides on high level overview on bagging, random forest, gbm etc

	## #############################################################################
	## H2O
	## #############################################################################

	## start and connect to H2O
	library(h2o)
	h2o.init()
	## in a browser check http://localhost:54321

	rpart()
	h2o.randomForest()

	## demo data
	library(hflights)
	rm(hflights)
	str(hflights)

	## copy to H2O
	hflights.hex <- as.h2o(hflights, 'hflights')

	str(hflights.hex)
	str(head(hflights.hex))
	head(hflights.hex, 3)
	summary(hflights.hex)
	## check flight number: numeric?
	## check on H2O web interface as well (enum vs factor)

	## convert numeric to factor/enum
	hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
	summary(hflights.hex)

	## boring
	for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
	hflights.hex[, v] <- as.factor(hflights.hex[, v])
	}

	summary(hflights.hex)

	## feature engineering: departure time? is it OK? hour of the day?
	## redo everything... just use the R script
	library(data.table)
	hflights <- data.table(hflights)
	hflights[, hour := substr(DepTime, 1, 2)]
	hflights[, .N, by = hour]

	hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
	hflights[, .N, by = hour][order(hour)]
	hflights[hour == '']

	hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
	hflights[, .N, by = hour]

	hflights[is.na(hour)]
	hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

	## drop columns
	hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
	UniqueCarrier, FlightNum, TailNum, Distance,
	Cancelled = factor(Cancelled))]


	## re-upload to H2O
	h2o.ls()
	hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

	## split the data
	h2o.splitFrame(data = hflights.hex , ratios = 0.75,
	destination_frames = paste0('hflights', 0:1))
	h2o.ls()

	## build the first model
	hflights.rf <- h2o.randomForest(
	model_id = 'hflights_rf',
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1')
	hflights.rf

	## more trees
	hflights.rf <- h2o.randomForest(
	model_id = 'hflights_rf500',
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1', ntrees = 500)

	## change to UI and check ROC curve & AUC

	## GBM
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1',
	model_id = 'hflights_gbm')

	## more trees should help, again !!!
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1',
	model_id = 'hflights_gbm2', ntrees = 550)

	## but no: although higher training AUC, lower validation AUC => overfit

	## more trees should help, again !!!
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights_part0',
	validation_frame = 'hflights_part1',
	model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)

	## bye
	h2o.shutdown()