Skip to content

Instantly share code, notes, and snippets.

@daroczig
Created November 19, 2018 09:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daroczig/f9a0cba2691f99617d0bfd20578bf8e3 to your computer and use it in GitHub Desktop.
Save daroczig/f9a0cba2691f99617d0bfd20578bf8e3 to your computer and use it in GitHub Desktop.
BCE // Vállalati Pénzügyi Információs Rendszerek // 2018
## #############################################################################
## PCA demo on image processing
## #############################################################################
download.file('http://bit.ly/nasa-image-pca', 'image.jpg') # mode = »bw«
library(jpeg)
img <- readJPEG('image.jpg')
str(img)
dim(img)
h <- dim(img)[1]
w <- dim(img)[2]
img1d <- matrix(img, h * w)
pca <- prcomp(img1d)
pca
summary(pca)
image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))
image(matrix(pca$x[, 2], h), col = gray.colors(100))
image(matrix(pca$x[, 3], h))
## #############################################################################
## MDS
## #############################################################################
download.file('http://bit.ly/hun-cities-distance', 'cities.xls') # mode = »bw«
library(readxl)
cities <- read_excel('cities .xls')
## get rid of 1st column and last row (metadata)
cities <- cities[, -1]
cities <- cities[-nrow(cities), ]
mds <- cmdscale(as.dist(cities))
plot(mds)
text(mds[, 1], mds[, 2], tail(names(cities), -1))
mds <- -mds
plot(mds)
text(mds[, 1], mds[, 2], tail(names(cities), -1))
mds <- cmdscale(dist(mtcars))
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))
mds <- as.data.frame(mds)
library(ggplot2) # start with geom_point
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text()
library(ggrepel)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + geom_text_repel()
## #############################################################################
## feature selection
## #############################################################################
str(iris)
plot(iris$Sepal.Length, iris$Sepal.Width)
fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit
summary(fit)
plot(iris$Sepal.Length, iris$Sepal.Width)
abline(fit, col = 'red', lwd = 5)
## now try
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))
ggplot(iris, aes(Sepal.Length, Sepal.Width, col = Species)) +
geom_point() + geom_smooth(method = 'lm')
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point(aes(col = Species)) +
geom_smooth(aes(col = Species), method = 'lm') +
geom_smooth(method = 'lm', col = 'black') + theme_bw()
## #############################################################################
## PCA demo to show the steps in hierarchical clustering
## #############################################################################
prcomp(iris[, 1:4])
PC <- prcomp(iris[, 1:4])$x
dm <- dist(iris[, 1:4])
hc <- hclust(dm)
plot(hc)
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i, border = 'red')
Sys.sleep(1)
}
library(animation)
ani.options(interval = 1)
saveGIF({
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i, border = 'red')
}
})
library(animation)
ani.options(interval = 1)
saveGIF({
for (i in 1:8) {
plot(-PC[, 1:2], col = cutree(hc, i), pch = as.numeric(factor(iris$Species)) + 5)
}
})
## #############################################################################
## basic decision tress
## #############################################################################
rm(iris)
set.seed(100)
mysample <- sample(1:150, 100)
str(mysample)
train <- iris[mysample, ]
test <- iris[setdiff(1:150, mysample), ]
library(rpart)
ct <- rpart(Species ~ ., data = train)
summary(ct)
plot(ct); text(ct)
library(partykit)
plot(as.party(ct))
predict(ct, newdata = test)
predict(ct, newdata = test, type = 'class')
## confusion matrix
table(test$Species,
predict(ct, newdata = test,
type = 'class'))
ct <- rpart(Species ~ ., data = train,
minsplit = 1, cp = 0.001)
## ?rpart.control
plot(ct); text(ct)
plot(as.party(ct))
table(test$Species,
predict(ct, newdata = test,
type = 'class'))
## back to slides on high level overview on bagging, random forest, gbm etc
## #############################################################################
## H2O
## #############################################################################
## start and connect to H2O
library(h2o)
h2o.init()
## in a browser check http://localhost:54321
rpart()
h2o.randomForest()
## demo data
library(hflights)
rm(hflights)
str(hflights)
## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')
str(hflights.hex)
str(head(hflights.hex))
head(hflights.hex, 3)
summary(hflights.hex)
## check flight number: numeric?
## check on H2O web interface as well (enum vs factor)
## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)
## boring
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
hflights.hex[, v] <- as.factor(hflights.hex[, v])
}
summary(hflights.hex)
## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]
hflights[, .N, by = hour]
hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
hflights[, .N, by = hour][order(hour)]
hflights[hour == '']
hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
hflights[, .N, by = hour]
hflights[is.na(hour)]
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]
## drop columns
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
UniqueCarrier, FlightNum, TailNum, Distance,
Cancelled = factor(Cancelled))]
## re-upload to H2O
h2o.ls()
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')
## split the data
h2o.splitFrame(data = hflights.hex , ratios = 0.75,
destination_frames = paste0('hflights', 0:1))
h2o.ls()
## build the first model
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf',
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1')
hflights.rf
## more trees
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf500',
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1', ntrees = 500)
## change to UI and check ROC curve & AUC
## GBM
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm')
## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm2', ntrees = 550)
## but no: although higher training AUC, lower validation AUC => overfit
## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights_part0',
validation_frame = 'hflights_part1',
model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)
## bye
h2o.shutdown()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment