daroczig/Crunch-2018-R-workshop.R

## Crunch-2018-R-workshop.R
## #############################################################################
## intro slides: http://bit.ly/CRUNCH-R-2018

## #############################################################################
## intro to R

## basic operations
1 + 3
3 * 2

## constants
pi
"pi"
'pi'

## getting help
?pi
month.name
month.abb
apropos('month')

## vectors
letters
LETTERS
str(letters)

## custom vectors
letters[1]
letters[5]
letters[1:5]
1:5

seq(1, 5)
?seq
seq(1, 5, 0.1)
seq(to = 5, by = 0.1, from = 1)

## variables
x = 4  # works, but not suggested
x
x <- 4 # works, and recommened

x * 2
x ^ 2

## TODO compute the square root of x
sqrt(x)
x ^ 0.5

## functions
sin(pi / 2)
round(pi)
?round
round(pi, digits = 2)
ceiling(pi)
floor(pi)

runif(1)
?runif
runif(5)

## custom functions
f <- function(x) 2 * x + 1
f <- function(x) {
  2 * x + 1
}
f(5)
f(pi)

## vectorized operations
f(1:5)

x <- seq(1, 5, 0.1)
x
f(x)

## visualize function
plot(x, f(x))
?plot
plot(x, f(x), type = 'l')
plot(x, f(x), type = 'b')

## TODO draw 1 period of sine curve / wave
?sin
f <- function(x) sin(x)
x <- seq(0, pi * 2, by = 0.1)
plot(x, sin(x), type = 'l', xlim = c(2, 4))

## a simpler way for plotting functions
?curve
curve(f, from = 0, to = pi * 2)
grid()

## TODO Brownian motion / random walk in 1D
set.seed(42)
x <- 0
for (i in 1:500) {
  x <- x + sign(runif(1) - 0.5)
}
x
## Rcpp

round(runif(1)) - 0.5
?sign
sign(runif(10) - 0.5)
sign(runif(1, min = -0.5, max = 0.5))

cumsum(sign(runif(25, min = -0.5, max = 0.5)))

x <- sign(runif(25, min = -0.5, max = 0.5))
x
cumsum(x)
cumsum(x)[25]
cumsum(x)[length(x)]
tail(cumsum(x), 1)
sum(x)


plot(1:25, cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's')
for (i in 1:5) {
  lines(cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's', col = i)
}

## pro tipp #1
plot(c(1, 25), c(-25, 25), type = 'n')
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))

## pro tipp #2
sample(c(-1, 1), 25, replace = TRUE)

## custom vectors -> combine values into vector
h <- c(174, 170, 160)
w <- c(90, 80, 70)

plot(h, w)
plot(h, w, main = 'Heights and weights', xlab = 'Height', ylab = 'Weight')

## some basic modeling
cor(h, w)
?lm
lm(w ~ h) # formula notation => predicting w based on h

172 * 1.346 - 146.154

fit <- lm(w ~ h)
lm(h ~ w)
fit

predict(fit, newdata = list(h = 172))

str(fit)
fit
summary(fit)

predict(fit, newdata = list(h = 172))

predict(fit, newdata = list(h = c(56, 104)))

## we need more data
## we need more complex models
## ...
## extrapolating!

## TODO visualize the lm
plot(h, w, xlim = c(0, 250), ylim = c(0, 150))
fit
abline(fit, col = 'red')

## example for polynomial model
fit <- lm(w ~ poly(h, 2, raw = TRUE))
fit <- lm(w ~ h + I(h^2))
fit

## #############################################################################
## intro to data.frame

df <- data.frame(weight = w, height = h)
df
str(df)

df[1, 2]
df[2, 2]
df[1, ]
df[, 2]
str(df)
df$weight
df[, 1]
df$weight[2]

plot(df)
cor(df)

## TODO let's create a 3rd column: BMI
## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters
df
df$bmi <- df$weight / (df$height / 100) ^ 2
df

## get more data
?read.csv
df <- read.csv('http://bit.ly/heightweight')
str(df)
cor(df[, 2:5])
cor(df[, -1])

names(df)
plot(df)

## drop (redundant) column
df$ageMonth <- NULL
str(df)

## more EDA
install.packages('GGally')
library(GGally)
pairs(df)
ggpairs(df)

library(pairsD3)
pairsD3(df)

summary(df)
min(df$ageYear)
max(df$ageYear)
mean(df$ageYear)
median(df$ageYear)
sd(df$ageYear)
length(df$ageYear)
nrow(df)
ncol(df)
dim(df)
c(nrow(df), ncol(df))

range(df$ageYear)
diff(range(df$ageYear))

## TODO compute BMI (kg, cm)
df$height <- df$heightIn * 2.54
df$weight <- df$weightLb * 0.45
df$bmi <- df$weight / (df$height / 100) ^ 2

## TODO visualize the lm predicting weight based on height
lm(df$weight ~ df$height)
fit <- lm(weight ~ height, data = df)

plot(df$height, df$weight)
abline(fit, col = 'red')

## TODO check if there's a diff in the height of m/f
t.test(height ~ sex, data = df)
t.test(weight ~ sex, data = df)

?t.test
?aov
?TukeyHSD

aov(height ~ sex, data = df)
summary(aov(height ~ sex, data = df))
summary(aov(weight ~ sex, data = df))

TukeyHSD(aov(height ~ sex, data = df))
TukeyHSD(aov(weight ~ sex, data = df))

## #############################################################################
## ggplot2: The R implementation of the Grammar of Graphics

library(ggplot2)
?diamonds
str(diamonds)

## barplot
ggplot(diamonds, aes(x = cut)) + geom_bar()
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
ggplot(diamonds, aes(x = cut)) +
  geom_bar(colour = 'orange', fill = 'white') +
  theme_bw()

p <- ggplot(diamonds, aes(x = cut))
p
p + geom_bar()
p <- p + geom_bar()
p
p + theme_bw()

## coord transformations
p
p + scale_y_log10()
p + scale_y_sqrt()
p + scale_y_reverse()
p + coord_flip()

## facets
p
p + facet_wrap( ~ clarity)
str(diamonds)
p + facet_grid(color ~ clarity, scales = 'free')
## formula: lhs ~ rhs1 + rhs2

## histogram
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 100)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10000)

## back to barplots: stacked and clustered bar charts
ggplot(diamonds, aes(x = cut)) + geom_bar(fill = 'white')

ggplot(diamonds, aes(x = cut)) + geom_bar(aes(fill = clarity))
ggplot(diamonds, aes(x = cut, fill = clarity)) + geom_bar()

p <- ggplot(diamonds, aes(x = cut, fill = clarity))
p + geom_bar()
p + geom_bar(position = 'fill')
p + geom_bar(position = 'dodge')

## density function
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_histogram(binwidth = 1000)

ggplot(diamonds, aes(price, fill = cut)) + geom_density()
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.25)

## scatterplot
ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
ggplot(diamonds, aes(x = carat, y = price)) + geom_point(alpha = 0.1)

## right geom?
install.packages('hexbin')
ggplot(diamonds, aes(x = carat, y = price)) + geom_hex()

ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() +
  geom_smooth(method = 'lm') +
  geom_smooth(color = 'red')

## themes
library(ggthemes)
p <- ggplot(diamonds, aes(carat, price, color = cut)) + geom_point()
p
p + theme_economist() + scale_colour_economist()
p + theme_stata() + scale_colour_stata()
p + theme_excel() + scale_colour_excel()
p + theme_wsj() + scale_colour_wsj("colors6", "")
p + theme_gdocs() + scale_colour_gdocs()

p
p + theme_bw()
theme_bw
p + theme(axis.text.y = element_text(angle = 90, hjust = 0.5))

## create a custom theme for future usage
theme_custom <- function() {
    theme(
        axis.text = element_text(
            family = 'Times New Roman',
            color  = "orange",
            size   = 12,
            face   = "italic"),
        axis.title = element_text(
            family = 'Times New Roman',
            color  = "orange",
            size   = 16,
            face   = "bold"),
        axis.text.y = element_text(angle = 90, hjust = 0.5),
        panel.background = element_rect(
            fill = "orange",
            color = "white",
            size = 2)
    )
}
p + theme_custom()

?mtcars

## TODO plot the distribution of horsepower
ggplot(mtcars, aes(x = hp)) + geom_histogram()

## TODO plot the distribution of the number carburators
ggplot(mtcars, aes(x = carb)) + geom_bar()
ggplot(mtcars, aes(x = factor(carb))) + geom_bar()
mtcars$carb <- factor(mtcars$carb)

## TODO barplot of carburators per am
ggplot(mtcars, aes(carb)) + geom_bar() + facet_wrap(~am)
ggplot(mtcars, aes(carb, fill = factor(am))) + geom_bar()

## TODO boxplot of hp by carb
ggplot(mtcars, aes(x = factor(carb), y = hp)) + geom_boxplot()
table(mtcars$carb)

## TODO plot hp and wt by carb
ggplot(mtcars, aes(hp, wt)) + geom_point() + facet_wrap(~carb)

## TODO plot regression (lm) on wt predicting hp
ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm')
?geom_smooth
ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm', se = FALSE)

## #############################################################################
## feature selection example

?iris
str(iris)

ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
  geom_smooth(method = 'lm')

ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
  geom_smooth()

## Simpon's paradox
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(aes(color = Species)) +
  geom_smooth(aes(color = Species), method = 'lm') +
  geom_smooth(method = 'lm', color = 'black') +
  theme(legend.position = 'top')

summary(lm(Sepal.Width ~ Sepal.Length, data = iris))
summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))

str(iris)

## #############################################################################
## hierachical clustering

?hclust

dm <- dist(iris[, 1:4])
str(dm)
hc <- hclust(dm)
plot(hc)
rect.hclust(hc, k = 3)

clusters <- cutree(hc, 3)

plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species,
     pch = clusters)

table(iris$Species)
table(clusters)
table(iris$Species, clusters)

for (i in 2:8) {
  plot(hc)
  rect.hclust(hc, k = i)
  Sys.sleep(1)
}

library(animation)
ani.options(interval = 1)
saveGIF({
  for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i)
  }
})

saveGIF({
  for (i in 2:8) {
    clusters <- cutree(hc, i)
    plot(iris$Sepal.Length, iris$Sepal.Width, col = clusters)
  }
})

## #############################################################################
## supervised learning: a basic CART decision tree

str(iris)

install.packages('rpart')
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width +
        Petal.Length + Petal.Width, data = iris)
plot(fit)
text(fit)

library(partykit)
plot(as.party(fit))
fit

predict(fit)
predict(fit, type = 'class')

table(iris$Species, predict(fit, type = 'class'))

## #############################################################################
## Principal Component Analysis (PCA)

download.file('http://bit.ly/nasa-image-pca', 'image.jpg', mode = 'wb')
?download.file

library(jpeg)
img <- readJPEG('image.jpg')
str(img) ## RGB 0-1
dim(img)

h <- dim(img)[1]
w <- dim(img)[2]
h
w

img2d <- matrix(img, h * w)
str(img2d)

pca <- prcomp(img2d)
summary(pca)

str(pca)

image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))

## #############################################################################
## Multi-Dimensional Scaling (MDS)

download.file('http://bit.ly/hun-cities-distance',
              'cities.xls')
library(readxl)
cities <- read_excel('cities.xls')
str(cities)
## cities$`Települések távolsága közúton (km)` <- NULL
cities[, 2:41]
str(cities[, -1])

cities <- cities[, -1]
str(cities)
cities[41, ]
cities <- cities[1:40, ]
str(cities)

mds <- cmdscale(as.dist(cities))
mds

plot(mds)
text(mds[, 1], mds[, 2], names(cities))

## use mds with non-geo stuff
str(mtcars)
mtcars

mds <- cmdscale(dist(mtcars))
mds

plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))

library(ggplot2)
str(mds)
mds <- as.data.frame(mds)
str(mds)
mds$car <- rownames(mtcars)
str(mds)

ggplot(mds, aes(V1, V2, label = car)) + geom_text()

library(ggrepel)
ggplot(mds, aes(V1, V2, label = car)) + geom_text_repel()

## #############################################################################
## H2O example for random forest and gbm

## start and connect to H2O
library(h2o)
h2o.init()
## in a browser check http://localhost:54321

## demo data
library(hflights)
rm(hflights)
str(hflights)

## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')

str(hflights.hex)
str(head(hflights.hex))
head(hflights.hex, 3)
summary(hflights.hex)
## check flight number: numeric?
## check on H2O web interface as well (enum vs factor)

## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)

## boring
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
    hflights.hex[, v] <- as.factor(hflights.hex[, v])
}

summary(hflights.hex)

## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]
hflights[, .N, by = hour]

hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
hflights[, .N, by = hour][order(hour)]
hflights[hour == '']

hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
hflights[, .N, by = hour]

hflights[is.na(hour)]
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

## drop columns
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
                         UniqueCarrier, FlightNum, TailNum, Distance,
                         Cancelled = factor(Cancelled))]

## re-upload to H2O
h2o.ls()
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

## split the data
h2o.splitFrame(data = hflights.hex , ratios = 0.75,
               destination_frames = paste0('hflights', 0:1))
h2o.ls()

## build the first model
hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1')
hflights.rf

## more trees
hflights.rf <- h2o.randomForest(
    model_id = 'hflights_rf500',
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1', ntrees = 500)

## change to UI and check ROC curve & AUC

## GBM
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm')

## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights0',
    validation_frame = 'hflights1',
    model_id = 'hflights_gbm2', ntrees = 550)

## but no: although higher training AUC, lower validation AUC => overfit

## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
    x = setdiff(names(hflights), 'Cancelled'),
    y = 'Cancelled',
    training_frame = 'hflights_part0',
    validation_frame = 'hflights_part1',
    model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)

## bye
h2o.shutdown()
	## #############################################################################
	## intro slides: http://bit.ly/CRUNCH-R-2018

	## #############################################################################
	## intro to R

	## basic operations
	1 + 3
	3 * 2

	## constants
	pi
	"pi"
	'pi'

	## getting help
	?pi
	month.name
	month.abb
	apropos('month')

	## vectors
	letters
	LETTERS
	str(letters)

	## custom vectors
	letters[1]
	letters[5]
	letters[1:5]
	1:5

	seq(1, 5)
	?seq
	seq(1, 5, 0.1)
	seq(to = 5, by = 0.1, from = 1)

	## variables
	x = 4 # works, but not suggested
	x
	x <- 4 # works, and recommened

	x * 2
	x ^ 2

	## TODO compute the square root of x
	sqrt(x)
	x ^ 0.5

	## functions
	sin(pi / 2)
	round(pi)
	?round
	round(pi, digits = 2)
	ceiling(pi)
	floor(pi)

	runif(1)
	?runif
	runif(5)

	## custom functions
	f <- function(x) 2 * x + 1
	f <- function(x) {
	2 * x + 1
	}
	f(5)
	f(pi)

	## vectorized operations
	f(1:5)

	x <- seq(1, 5, 0.1)
	x
	f(x)

	## visualize function
	plot(x, f(x))
	?plot
	plot(x, f(x), type = 'l')
	plot(x, f(x), type = 'b')

	## TODO draw 1 period of sine curve / wave
	?sin
	f <- function(x) sin(x)
	x <- seq(0, pi * 2, by = 0.1)
	plot(x, sin(x), type = 'l', xlim = c(2, 4))

	## a simpler way for plotting functions
	?curve
	curve(f, from = 0, to = pi * 2)
	grid()

	## TODO Brownian motion / random walk in 1D
	set.seed(42)
	x <- 0
	for (i in 1:500) {
	x <- x + sign(runif(1) - 0.5)
	}
	x
	## Rcpp

	round(runif(1)) - 0.5
	?sign
	sign(runif(10) - 0.5)
	sign(runif(1, min = -0.5, max = 0.5))

	cumsum(sign(runif(25, min = -0.5, max = 0.5)))

	x <- sign(runif(25, min = -0.5, max = 0.5))
	x
	cumsum(x)
	cumsum(x)[25]
	cumsum(x)[length(x)]
	tail(cumsum(x), 1)
	sum(x)


	plot(1:25, cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's')
	for (i in 1:5) {
	lines(cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's', col = i)
	}

	## pro tipp #1
	plot(c(1, 25), c(-25, 25), type = 'n')
	plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))

	## pro tipp #2
	sample(c(-1, 1), 25, replace = TRUE)

	## custom vectors -> combine values into vector
	h <- c(174, 170, 160)
	w <- c(90, 80, 70)

	plot(h, w)
	plot(h, w, main = 'Heights and weights', xlab = 'Height', ylab = 'Weight')

	## some basic modeling
	cor(h, w)
	?lm
	lm(w ~ h) # formula notation => predicting w based on h

	172 * 1.346 - 146.154

	fit <- lm(w ~ h)
	lm(h ~ w)
	fit

	predict(fit, newdata = list(h = 172))

	str(fit)
	fit
	summary(fit)

	predict(fit, newdata = list(h = 172))

	predict(fit, newdata = list(h = c(56, 104)))

	## we need more data
	## we need more complex models
	## ...
	## extrapolating!

	## TODO visualize the lm
	plot(h, w, xlim = c(0, 250), ylim = c(0, 150))
	fit
	abline(fit, col = 'red')

	## example for polynomial model
	fit <- lm(w ~ poly(h, 2, raw = TRUE))
	fit <- lm(w ~ h + I(h^2))
	fit

	## #############################################################################
	## intro to data.frame

	df <- data.frame(weight = w, height = h)
	df
	str(df)

	df[1, 2]
	df[2, 2]
	df[1, ]
	df[, 2]
	str(df)
	df$weight
	df[, 1]
	df$weight[2]

	plot(df)
	cor(df)

	## TODO let's create a 3rd column: BMI
	## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters
	df
	df$bmi <- df$weight / (df$height / 100) ^ 2
	df

	## get more data
	?read.csv
	df <- read.csv('http://bit.ly/heightweight')
	str(df)
	cor(df[, 2:5])
	cor(df[, -1])

	names(df)
	plot(df)

	## drop (redundant) column
	df$ageMonth <- NULL
	str(df)

	## more EDA
	install.packages('GGally')
	library(GGally)
	pairs(df)
	ggpairs(df)

	library(pairsD3)
	pairsD3(df)

	summary(df)
	min(df$ageYear)
	max(df$ageYear)
	mean(df$ageYear)
	median(df$ageYear)
	sd(df$ageYear)
	length(df$ageYear)
	nrow(df)
	ncol(df)
	dim(df)
	c(nrow(df), ncol(df))

	range(df$ageYear)
	diff(range(df$ageYear))

	## TODO compute BMI (kg, cm)
	df$height <- df$heightIn * 2.54
	df$weight <- df$weightLb * 0.45
	df$bmi <- df$weight / (df$height / 100) ^ 2

	## TODO visualize the lm predicting weight based on height
	lm(df$weight ~ df$height)
	fit <- lm(weight ~ height, data = df)

	plot(df$height, df$weight)
	abline(fit, col = 'red')

	## TODO check if there's a diff in the height of m/f
	t.test(height ~ sex, data = df)
	t.test(weight ~ sex, data = df)

	?t.test
	?aov
	?TukeyHSD

	aov(height ~ sex, data = df)
	summary(aov(height ~ sex, data = df))
	summary(aov(weight ~ sex, data = df))

	TukeyHSD(aov(height ~ sex, data = df))
	TukeyHSD(aov(weight ~ sex, data = df))

	## #############################################################################
	## ggplot2: The R implementation of the Grammar of Graphics

	library(ggplot2)
	?diamonds
	str(diamonds)

	## barplot
	ggplot(diamonds, aes(x = cut)) + geom_bar()
	ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
	ggplot(diamonds, aes(x = cut)) +
	geom_bar(colour = 'orange', fill = 'white') +
	theme_bw()

	p <- ggplot(diamonds, aes(x = cut))
	p
	p + geom_bar()
	p <- p + geom_bar()
	p
	p + theme_bw()

	## coord transformations
	p
	p + scale_y_log10()
	p + scale_y_sqrt()
	p + scale_y_reverse()
	p + coord_flip()

	## facets
	p
	p + facet_wrap( ~ clarity)
	str(diamonds)
	p + facet_grid(color ~ clarity, scales = 'free')
	## formula: lhs ~ rhs1 + rhs2

	## histogram
	ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10)
	ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 100)
	ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
	ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10000)

	## back to barplots: stacked and clustered bar charts
	ggplot(diamonds, aes(x = cut)) + geom_bar(fill = 'white')

	ggplot(diamonds, aes(x = cut)) + geom_bar(aes(fill = clarity))
	ggplot(diamonds, aes(x = cut, fill = clarity)) + geom_bar()

	p <- ggplot(diamonds, aes(x = cut, fill = clarity))
	p + geom_bar()
	p + geom_bar(position = 'fill')
	p + geom_bar(position = 'dodge')

	## density function
	ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
	ggplot(diamonds, aes(x = price, fill = cut)) +
	geom_histogram(binwidth = 1000)

	ggplot(diamonds, aes(price, fill = cut)) + geom_density()
	ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.25)

	## scatterplot
	ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
	ggplot(diamonds, aes(x = carat, y = price)) + geom_point(alpha = 0.1)

	## right geom?
	install.packages('hexbin')
	ggplot(diamonds, aes(x = carat, y = price)) + geom_hex()

	ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() +
	geom_smooth(method = 'lm') +
	geom_smooth(color = 'red')

	## themes
	library(ggthemes)
	p <- ggplot(diamonds, aes(carat, price, color = cut)) + geom_point()
	p
	p + theme_economist() + scale_colour_economist()
	p + theme_stata() + scale_colour_stata()
	p + theme_excel() + scale_colour_excel()
	p + theme_wsj() + scale_colour_wsj("colors6", "")
	p + theme_gdocs() + scale_colour_gdocs()

	p
	p + theme_bw()
	theme_bw
	p + theme(axis.text.y = element_text(angle = 90, hjust = 0.5))

	## create a custom theme for future usage
	theme_custom <- function() {
	theme(
	axis.text = element_text(
	family = 'Times New Roman',
	color = "orange",
	size = 12,
	face = "italic"),
	axis.title = element_text(
	family = 'Times New Roman',
	color = "orange",
	size = 16,
	face = "bold"),
	axis.text.y = element_text(angle = 90, hjust = 0.5),
	panel.background = element_rect(
	fill = "orange",
	color = "white",
	size = 2)
	)
	}
	p + theme_custom()

	?mtcars

	## TODO plot the distribution of horsepower
	ggplot(mtcars, aes(x = hp)) + geom_histogram()

	## TODO plot the distribution of the number carburators
	ggplot(mtcars, aes(x = carb)) + geom_bar()
	ggplot(mtcars, aes(x = factor(carb))) + geom_bar()
	mtcars$carb <- factor(mtcars$carb)

	## TODO barplot of carburators per am
	ggplot(mtcars, aes(carb)) + geom_bar() + facet_wrap(~am)
	ggplot(mtcars, aes(carb, fill = factor(am))) + geom_bar()

	## TODO boxplot of hp by carb
	ggplot(mtcars, aes(x = factor(carb), y = hp)) + geom_boxplot()
	table(mtcars$carb)

	## TODO plot hp and wt by carb
	ggplot(mtcars, aes(hp, wt)) + geom_point() + facet_wrap(~carb)

	## TODO plot regression (lm) on wt predicting hp
	ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm')
	?geom_smooth
	ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm', se = FALSE)

	## #############################################################################
	## feature selection example

	?iris
	str(iris)

	ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()
	ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
	geom_smooth(method = 'lm')

	ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
	geom_smooth()

	## Simpon's paradox
	ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
	geom_point(aes(color = Species)) +
	geom_smooth(aes(color = Species), method = 'lm') +
	geom_smooth(method = 'lm', color = 'black') +
	theme(legend.position = 'top')

	summary(lm(Sepal.Width ~ Sepal.Length, data = iris))
	summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))

	str(iris)

	## #############################################################################
	## hierachical clustering

	?hclust

	dm <- dist(iris[, 1:4])
	str(dm)
	hc <- hclust(dm)
	plot(hc)
	rect.hclust(hc, k = 3)

	clusters <- cutree(hc, 3)

	plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
	plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species,
	pch = clusters)

	table(iris$Species)
	table(clusters)
	table(iris$Species, clusters)

	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i)
	Sys.sleep(1)
	}

	library(animation)
	ani.options(interval = 1)
	saveGIF({
	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i)
	}
	})

	saveGIF({
	for (i in 2:8) {
	clusters <- cutree(hc, i)
	plot(iris$Sepal.Length, iris$Sepal.Width, col = clusters)
	}
	})

	## #############################################################################
	## supervised learning: a basic CART decision tree

	str(iris)

	install.packages('rpart')
	library(rpart)
	fit <- rpart(Species ~ Sepal.Length + Sepal.Width +
	Petal.Length + Petal.Width, data = iris)
	plot(fit)
	text(fit)

	library(partykit)
	plot(as.party(fit))
	fit

	predict(fit)
	predict(fit, type = 'class')

	table(iris$Species, predict(fit, type = 'class'))

	## #############################################################################
	## Principal Component Analysis (PCA)

	download.file('http://bit.ly/nasa-image-pca', 'image.jpg', mode = 'wb')
	?download.file

	library(jpeg)
	img <- readJPEG('image.jpg')
	str(img) ## RGB 0-1
	dim(img)

	h <- dim(img)[1]
	w <- dim(img)[2]
	h
	w

	img2d <- matrix(img, h * w)
	str(img2d)

	pca <- prcomp(img2d)
	summary(pca)

	str(pca)

	image(matrix(pca$x[, 1], h))
	image(matrix(pca$x[, 2], h))

	## #############################################################################
	## Multi-Dimensional Scaling (MDS)

	download.file('http://bit.ly/hun-cities-distance',
	'cities.xls')
	library(readxl)
	cities <- read_excel('cities.xls')
	str(cities)
	## cities$`Települések távolsága közúton (km)` <- NULL
	cities[, 2:41]
	str(cities[, -1])

	cities <- cities[, -1]
	str(cities)
	cities[41, ]
	cities <- cities[1:40, ]
	str(cities)

	mds <- cmdscale(as.dist(cities))
	mds

	plot(mds)
	text(mds[, 1], mds[, 2], names(cities))

	## use mds with non-geo stuff
	str(mtcars)
	mtcars

	mds <- cmdscale(dist(mtcars))
	mds

	plot(mds)
	text(mds[, 1], mds[, 2], rownames(mtcars))

	library(ggplot2)
	str(mds)
	mds <- as.data.frame(mds)
	str(mds)
	mds$car <- rownames(mtcars)
	str(mds)

	ggplot(mds, aes(V1, V2, label = car)) + geom_text()

	library(ggrepel)
	ggplot(mds, aes(V1, V2, label = car)) + geom_text_repel()

	## #############################################################################
	## H2O example for random forest and gbm

	## start and connect to H2O
	library(h2o)
	h2o.init()
	## in a browser check http://localhost:54321

	## demo data
	library(hflights)
	rm(hflights)
	str(hflights)

	## copy to H2O
	hflights.hex <- as.h2o(hflights, 'hflights')

	str(hflights.hex)
	str(head(hflights.hex))
	head(hflights.hex, 3)
	summary(hflights.hex)
	## check flight number: numeric?
	## check on H2O web interface as well (enum vs factor)

	## convert numeric to factor/enum
	hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
	summary(hflights.hex)

	## boring
	for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
	hflights.hex[, v] <- as.factor(hflights.hex[, v])
	}

	summary(hflights.hex)

	## feature engineering: departure time? is it OK? hour of the day?
	## redo everything... just use the R script
	library(data.table)
	hflights <- data.table(hflights)
	hflights[, hour := substr(DepTime, 1, 2)]
	hflights[, .N, by = hour]

	hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
	hflights[, .N, by = hour][order(hour)]
	hflights[hour == '']

	hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
	hflights[, .N, by = hour]

	hflights[is.na(hour)]
	hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

	## drop columns
	hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
	UniqueCarrier, FlightNum, TailNum, Distance,
	Cancelled = factor(Cancelled))]

	## re-upload to H2O
	h2o.ls()
	hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

	## split the data
	h2o.splitFrame(data = hflights.hex , ratios = 0.75,
	destination_frames = paste0('hflights', 0:1))
	h2o.ls()

	## build the first model
	hflights.rf <- h2o.randomForest(
	model_id = 'hflights_rf',
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1')
	hflights.rf

	## more trees
	hflights.rf <- h2o.randomForest(
	model_id = 'hflights_rf500',
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1', ntrees = 500)

	## change to UI and check ROC curve & AUC

	## GBM
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1',
	model_id = 'hflights_gbm')

	## more trees should help, again !!!
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights0',
	validation_frame = 'hflights1',
	model_id = 'hflights_gbm2', ntrees = 550)

	## but no: although higher training AUC, lower validation AUC => overfit

	## more trees should help, again !!!
	hflights.gbm <- h2o.gbm(
	x = setdiff(names(hflights), 'Cancelled'),
	y = 'Cancelled',
	training_frame = 'hflights_part0',
	validation_frame = 'hflights_part1',
	model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)

	## bye
	h2o.shutdown()