daroczig/Budapest-BI-Forum-2017.r

## Budapest-BI-Forum-2017.r
## intro slides: http://bit.ly/r-intro-slide

## basic operations
1 + 3
3*2
3^2

## constants
pi
"pi"
'pi'
letters
LETTERS
month.name
month.abb

## help
?pi
apropos('month')
apropos('cor')

## variables
x = 4
x <- 4
x

x / 2
x^2
x^0.5

## functions
sqrt(x)
sqrt(x = x)
sin(pi / 2)
round(pi)
ceiling(pi)
floor(pi)

## custom functions
f <- function(x) 2 * x + 1
f(5)
f(pi)

## vectors
c(1, 2, 3, 4, 5)
1:5
seq(1, 5)
?seq
seq(1, 5, 0.1)

x <- seq(1, 5, 0.1)
plot(x, f(x))
plot(x, f(x), type = 'l')

plot(x, f(x), type = 'l', xlab = '', main = '2x+1')
grid()


## TODO plot 1 period of sine curve
?sin
f <- function(x) sin(x)

x <- seq(0, pi * 2, 0.1)
plot(x, sin(x), type = 'l')

## a simpler way for plotting functions
curve(f)
?curve

curve(2*x + 1, from = 0, to = 50)
curve(x + 1, add = TRUE, col = 'red')

## TODO plot 1 period of sine curve
curve(sin, from = 0, to = 2*pi)

## TODO Brownian motion in 1D
runif(1)
runif(5)

## random numbers from uniform distribution
round(runif(5))
round(runif(5))*2 - 1

cumsum(round(runif(25))*2 - 1)

plot(cumsum(round(runif(25))*2 - 1), type = 's')

set.seed(42)
plot(cumsum(round(runif(25))*2 - 1), type = 's')

for (i in 2:6) {
    lines(cumsum(round(runif(25))*2 - 1), type = 's', col = i)
}

## pro tipp #1
plot(c(1, 25), c(-25, 25), type = 'n')
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))

## pro tipp #2
sample(c(-1, 1), 25, replace = TRUE)

################################################################################
## custom vectors -> combine values into vector
h <- c(174, 170, 160)
h
(w <- c(90, 80, 70))

plot(h, w)
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')

cor(w, h)

lm(w ~ h) # formula notation
fit <- lm(w ~ h)
summary(fit)

plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
abline(fit, col = 'red')

## some basic modeling
predict(fit, list(h = c(56, 104)))

56 * fit$coefficients[2] + fit$coefficients[1]
56 * 1.3462 + -146.1538

## why?
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight',
     xlim = c(0, max(h)),
     ylim = c(0, max(w)))
abline(fit, col = 'red')

## polynomial model
fit <- lm(w ~ poly(h, 2, raw = TRUE))
fit <- lm(w ~ h + I(h^2))
fit

2824 + 174 * -34.3571 + (174^2)*0.1071

plot(h, w)
lines(h, predict(fit), col = 'red')

predict(fit, data.frame(h = c(56, 104)))

plot(0:200, predict(fit, list(h = 0:200)), type = 'l')
abline(v = c(160, 174), col = 'red')

## data.frame
df <- data.frame(weight = w, height = h)
df
str(df)
df$weight
?'$'

plot(df)
cor(df)

lm(weight ~ height, data = df)

str(df)
df[i, j]
df[1, ]
df[, 1]
df[3, 2]
df$weight
df$weight[2]

## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters.
df$height <- df$height/100
df
df$bmi <- df$weight / df$height^2
df
df$bmi <- df$weight / (df$height/100)^2

## import data
df <- read.csv('http://bit.ly/heightweight')
str(df)

## TODO
df$height <- df$heightIn * 2.54
df$weight <- df$weightLb * 0.45
str(df)

df$heightIn <- df$weightLb <- NULL
str(df)

## descriptive stats
min(df$ageYear)
max(df$ageYear)
range(df$ageYear)
sum(df$weightLb)
length(df$weight)
nrow(df)
ncol(df)
dim(df)

## TODO plot and analyze association between height and weight
plot(df$height, df$weight)
abline(lm(weight ~ height, data = df), col = 'red')

## TODO compute and plot BMI index
df$bmi <- df$weight / (df$height/100)^2
hist(df$bmi)

## exploratory data analysis
pairs(df)

library(GGally)
ggpairs(df)

library(pairsD3)
pairsD3(df)

## statistical test: t-test, ANOVA
t.test(height ~ sex, data = df)

aov(height ~ sex, data = df)
summary(aov(height ~ sex, data = df))
summary(aov(weight ~ sex, data = df))

## Post hoc tests => Tukey Honest Significant Differences
TukeyHSD(aov(height ~ sex, data = df))
TukeyHSD(aov(weight ~ sex, data = df))

## #############################################################################
## Wilkinson: The Grammar of Graphics
## Wickham: ggplot2 -> layer by layer:
##  - coordinate system,
##  - aesthetics (labels)
##  - scales
##  - geometries
##  - shapes
##  - statistics
##  - grid, color theme etc. + layers
## http://www.cookbook-r.com/Graphs/
## #############################################################################

library(ggplot2)
?diamonds

## barplot
ggplot(diamonds, aes(x = cut)) + geom_bar()
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
ggplot(diamonds, aes(x = cut)) +
    geom_bar(colour = "darkgreen", fill = "white") +
    theme_bw()

p <- ggplot(diamonds, aes(x = cut)) +
    geom_bar(colour = "darkgreen", fill = "white") + theme_bw()
p

## coord transformations
library(scales)
p + scale_y_log10()
p + scale_y_sqrt()
p + scale_y_reverse()
p + coord_flip()

## facet
p + facet_wrap( ~ clarity)
p + facet_grid(color ~ clarity)

## stacked, clustered bar charts
p <- ggplot(diamonds, aes(x = color, fill = cut))
p + geom_bar()
p + geom_bar(position = "fill")
p + geom_bar(position = "dodge")

## histogram
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 100)
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 1000)
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 10000)

ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw()

## scatterplot
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point()

library(hexbin)
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_hex()

## adding layers
p
p <- p + geom_smooth()
p
p + geom_smooth(method = "lm", se = FALSE, color = 'red')

ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth()

## themes
library(ggthemes)
p + theme_economist() + scale_colour_economist()
p + theme_stata() + scale_colour_stata()
p + theme_excel() + scale_colour_excel()
p + theme_wsj() + scale_colour_wsj("colors6", "")
p + theme_gdocs() + scale_colour_gdocs()

library(ggthemr)

## Further reading at http://www.cookbook-r.com/Graphs/

## TODO on mtcars
##  * number of carburators
ggplot(mtcars, aes(x = carb)) + geom_bar()
##  * hp
ggplot(mtcars, aes(x = hp)) + geom_histogram()
##  * barplot of carburators per am
ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am)
## stacked bar chart
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar()
## grouped/dodged
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge')
##  * boxplot of hp by carb
ggplot(mtcars, aes(x = factor(am), y = hp)) + geom_boxplot()
##  * hp and wt by carb
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb)
##  * hp and wt by carb regression
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth()

## #############################################################################
## PCA demo on image processing
## #############################################################################

## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg
## http://bit.ly/BudapestBI-R-img
download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg')

library(jpeg)
img <- readJPEG('image.jpg')
str(img)

dim(img)
h <- dim(img)[1]
w <- dim(img)[2]
h;w

img1d <- matrix(img, h * w)
str(img1d)

pca <- prcomp(img1d)
pca
summary(pca)

image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))
image(matrix(pca$x[, 2], h), col = gray.colors(100))

image(matrix(pca$x[, 3], h))

## #############################################################################
## MDS
## #############################################################################

## distance between 40 Hungarian cities -> 2D scatterplot
download.file('http://bit.ly/CEU-R-distances', 'cities.xls')

library(readxl)
cities <- read_excel('cities.xls')
str(cities)

## get rid of 1st column and last row (metadata)
cities <- cities[, -1]
cities <- cities[-nrow(cities), ]

mds <- cmdscale(as.dist(cities))
mds

plot(mds)
text(mds[, 1], mds[, 2], names(cities))

## flipping both x and y axis
mds <- -mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))

## distance between European cities -> 2D scatterplot on ggplot

mds <- cmdscale(eurodist)
mds

mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mds))) + geom_text()

## non-geo stuff
str(mtcars)
?mtcars
mds <- cmdscale(dist(mtcars))
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))

mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
    geom_text() + theme_bw()

library(ggrepel)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
    geom_text_repel() + theme_bw()

## #############################################################################
## feature selection - same in 2D
## #############################################################################

str(iris)
plot(iris$Sepal.Length, iris$Sepal.Width)

fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit
summary(fit)

plot(iris$Sepal.Length, iris$Sepal.Width)
abline(fit, col = 'red', lwd = 5)

## TODO improve the model
summary(lm(Sepal.Width ~ Sepal.Length + Species,
           data = iris))

library(ggplot2)
ggplot(iris, aes(Sepal.Length, Sepal.Width,
                 col = Species)) +
    geom_point() + geom_smooth(method = 'lm')

ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
    geom_point(aes(col = Species)) +
    geom_smooth(aes(col = Species), method = 'lm') +
    geom_smooth(method = 'lm', col = 'black') + theme_bw()

## #############################################################################
## PCA demo to show the steps in hierarchical clustering
## #############################################################################

PC <- prcomp(iris[, 1:4])$x
dm <- dist(iris[, 1:4])
hc <- hclust(dm)
plot(hc)

for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i, border = 'red')
    Sys.sleep(1)
}

library(animation)
ani.options(interval = 1)
saveGIF({
    for (i in 2:8) {
        plot(hc)
        rect.hclust(hc, k = i, border = 'red')
    }
})


library(animation)
ani.options(interval = 1)
saveGIF({
    for (i in 1:8) {
        plot(-PC[, 1:2],
             col = cutree(hc, i),
             pch = as.numeric(factor(iris$Species)) + 5)
    }
})

## #############################################################################
## basic decision trees
## #############################################################################

rm(iris)
set.seed(3)
iris$rnd <- runif(150)
iris <- iris[order(iris$rnd),]
iris
iris$rnd <- NULL

train <- iris[1:100, ]
test  <- iris[101:150, ]

library(rpart)
ct <- rpart(Species ~ ., data = train)
summary(ct)

plot(ct); text(ct)

library(partykit)
plot(as.party(ct))

predict(ct, newdata = test)
predict(ct, newdata = test, type = 'class')

## confusion matrix
table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))

ct <- rpart(Species ~ ., data = train,
            minsplit = 3)
## ?rpart.control
plot(ct); text(ct)

table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))
	## intro slides: http://bit.ly/r-intro-slide

	## basic operations
	1 + 3
	3*2
	3^2

	## constants
	pi
	"pi"
	'pi'
	letters
	LETTERS
	month.name
	month.abb

	## help
	?pi
	apropos('month')
	apropos('cor')

	## variables
	x = 4
	x <- 4
	x

	x / 2
	x^2
	x^0.5

	## functions
	sqrt(x)
	sqrt(x = x)
	sin(pi / 2)
	round(pi)
	ceiling(pi)
	floor(pi)

	## custom functions
	f <- function(x) 2 * x + 1
	f(5)
	f(pi)

	## vectors
	c(1, 2, 3, 4, 5)
	1:5
	seq(1, 5)
	?seq
	seq(1, 5, 0.1)

	x <- seq(1, 5, 0.1)
	plot(x, f(x))
	plot(x, f(x), type = 'l')

	plot(x, f(x), type = 'l', xlab = '', main = '2x+1')
	grid()


	## TODO plot 1 period of sine curve
	?sin
	f <- function(x) sin(x)

	x <- seq(0, pi * 2, 0.1)
	plot(x, sin(x), type = 'l')

	## a simpler way for plotting functions
	curve(f)
	?curve

	curve(2*x + 1, from = 0, to = 50)
	curve(x + 1, add = TRUE, col = 'red')

	## TODO plot 1 period of sine curve
	curve(sin, from = 0, to = 2*pi)

	## TODO Brownian motion in 1D
	runif(1)
	runif(5)

	## random numbers from uniform distribution
	round(runif(5))
	round(runif(5))*2 - 1

	cumsum(round(runif(25))*2 - 1)

	plot(cumsum(round(runif(25))*2 - 1), type = 's')

	set.seed(42)
	plot(cumsum(round(runif(25))*2 - 1), type = 's')

	for (i in 2:6) {
	lines(cumsum(round(runif(25))*2 - 1), type = 's', col = i)
	}

	## pro tipp #1
	plot(c(1, 25), c(-25, 25), type = 'n')
	plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))

	## pro tipp #2
	sample(c(-1, 1), 25, replace = TRUE)

	################################################################################
	## custom vectors -> combine values into vector
	h <- c(174, 170, 160)
	h
	(w <- c(90, 80, 70))

	plot(h, w)
	plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')

	cor(w, h)

	lm(w ~ h) # formula notation
	fit <- lm(w ~ h)
	summary(fit)

	plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
	abline(fit, col = 'red')

	## some basic modeling
	predict(fit, list(h = c(56, 104)))

	56 * fit$coefficients[2] + fit$coefficients[1]
	56 * 1.3462 + -146.1538

	## why?
	plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight',
	xlim = c(0, max(h)),
	ylim = c(0, max(w)))
	abline(fit, col = 'red')

	## polynomial model
	fit <- lm(w ~ poly(h, 2, raw = TRUE))
	fit <- lm(w ~ h + I(h^2))
	fit

	2824 + 174 * -34.3571 + (174^2)*0.1071

	plot(h, w)
	lines(h, predict(fit), col = 'red')

	predict(fit, data.frame(h = c(56, 104)))

	plot(0:200, predict(fit, list(h = 0:200)), type = 'l')
	abline(v = c(160, 174), col = 'red')

	## data.frame
	df <- data.frame(weight = w, height = h)
	df
	str(df)
	df$weight
	?'$'

	plot(df)
	cor(df)

	lm(weight ~ height, data = df)

	str(df)
	df[i, j]
	df[1, ]
	df[, 1]
	df[3, 2]
	df$weight
	df$weight[2]

	## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters.
	df$height <- df$height/100
	df
	df$bmi <- df$weight / df$height^2
	df
	df$bmi <- df$weight / (df$height/100)^2

	## import data
	df <- read.csv('http://bit.ly/heightweight')
	str(df)

	## TODO
	df$height <- df$heightIn * 2.54
	df$weight <- df$weightLb * 0.45
	str(df)

	df$heightIn <- df$weightLb <- NULL
	str(df)

	## descriptive stats
	min(df$ageYear)
	max(df$ageYear)
	range(df$ageYear)
	sum(df$weightLb)
	length(df$weight)
	nrow(df)
	ncol(df)
	dim(df)

	## TODO plot and analyze association between height and weight
	plot(df$height, df$weight)
	abline(lm(weight ~ height, data = df), col = 'red')

	## TODO compute and plot BMI index
	df$bmi <- df$weight / (df$height/100)^2
	hist(df$bmi)

	## exploratory data analysis
	pairs(df)

	library(GGally)
	ggpairs(df)

	library(pairsD3)
	pairsD3(df)

	## statistical test: t-test, ANOVA
	t.test(height ~ sex, data = df)

	aov(height ~ sex, data = df)
	summary(aov(height ~ sex, data = df))
	summary(aov(weight ~ sex, data = df))

	## Post hoc tests => Tukey Honest Significant Differences
	TukeyHSD(aov(height ~ sex, data = df))
	TukeyHSD(aov(weight ~ sex, data = df))

	## #############################################################################
	## Wilkinson: The Grammar of Graphics
	## Wickham: ggplot2 -> layer by layer:
	## - coordinate system,
	## - aesthetics (labels)
	## - scales
	## - geometries
	## - shapes
	## - statistics
	## - grid, color theme etc. + layers
	## http://www.cookbook-r.com/Graphs/
	## #############################################################################

	library(ggplot2)
	?diamonds

	## barplot
	ggplot(diamonds, aes(x = cut)) + geom_bar()
	ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
	ggplot(diamonds, aes(x = cut)) +
	geom_bar(colour = "darkgreen", fill = "white") +
	theme_bw()

	p <- ggplot(diamonds, aes(x = cut)) +
	geom_bar(colour = "darkgreen", fill = "white") + theme_bw()
	p

	## coord transformations
	library(scales)
	p + scale_y_log10()
	p + scale_y_sqrt()
	p + scale_y_reverse()
	p + coord_flip()

	## facet
	p + facet_wrap( ~ clarity)
	p + facet_grid(color ~ clarity)

	## stacked, clustered bar charts
	p <- ggplot(diamonds, aes(x = color, fill = cut))
	p + geom_bar()
	p + geom_bar(position = "fill")
	p + geom_bar(position = "dodge")

	## histogram
	ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 100)
	ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 1000)
	ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 10000)

	ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw()

	## scatterplot
	p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point()

	library(hexbin)
	p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_hex()

	## adding layers
	p
	p <- p + geom_smooth()
	p
	p + geom_smooth(method = "lm", se = FALSE, color = 'red')

	ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth()

	## themes
	library(ggthemes)
	p + theme_economist() + scale_colour_economist()
	p + theme_stata() + scale_colour_stata()
	p + theme_excel() + scale_colour_excel()
	p + theme_wsj() + scale_colour_wsj("colors6", "")
	p + theme_gdocs() + scale_colour_gdocs()

	library(ggthemr)

	## Further reading at http://www.cookbook-r.com/Graphs/

	## TODO on mtcars
	## * number of carburators
	ggplot(mtcars, aes(x = carb)) + geom_bar()
	## * hp
	ggplot(mtcars, aes(x = hp)) + geom_histogram()
	## * barplot of carburators per am
	ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am)
	## stacked bar chart
	ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar()
	## grouped/dodged
	ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge')
	## * boxplot of hp by carb
	ggplot(mtcars, aes(x = factor(am), y = hp)) + geom_boxplot()
	## * hp and wt by carb
	ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb)
	## * hp and wt by carb regression
	ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth()

	## #############################################################################
	## PCA demo on image processing
	## #############################################################################

	## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg
	## http://bit.ly/BudapestBI-R-img
	download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg')

	library(jpeg)
	img <- readJPEG('image.jpg')
	str(img)

	dim(img)
	h <- dim(img)[1]
	w <- dim(img)[2]
	h;w

	img1d <- matrix(img, h * w)
	str(img1d)

	pca <- prcomp(img1d)
	pca
	summary(pca)

	image(matrix(pca$x[, 1], h))
	image(matrix(pca$x[, 2], h))
	image(matrix(pca$x[, 2], h), col = gray.colors(100))

	image(matrix(pca$x[, 3], h))

	## #############################################################################
	## MDS
	## #############################################################################

	## distance between 40 Hungarian cities -> 2D scatterplot
	download.file('http://bit.ly/CEU-R-distances', 'cities.xls')

	library(readxl)
	cities <- read_excel('cities.xls')
	str(cities)

	## get rid of 1st column and last row (metadata)
	cities <- cities[, -1]
	cities <- cities[-nrow(cities), ]

	mds <- cmdscale(as.dist(cities))
	mds

	plot(mds)
	text(mds[, 1], mds[, 2], names(cities))

	## flipping both x and y axis
	mds <- -mds
	plot(mds)
	text(mds[, 1], mds[, 2], names(cities))

	## distance between European cities -> 2D scatterplot on ggplot

	mds <- cmdscale(eurodist)
	mds

	mds <- as.data.frame(mds)
	library(ggplot2)
	ggplot(mds, aes(V1, -V2, label = rownames(mds))) + geom_text()

	## non-geo stuff
	str(mtcars)
	?mtcars
	mds <- cmdscale(dist(mtcars))
	plot(mds)
	text(mds[, 1], mds[, 2], rownames(mtcars))

	mds <- as.data.frame(mds)
	library(ggplot2)
	ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
	geom_text() + theme_bw()

	library(ggrepel)
	ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
	geom_text_repel() + theme_bw()

	## #############################################################################
	## feature selection - same in 2D
	## #############################################################################

	str(iris)
	plot(iris$Sepal.Length, iris$Sepal.Width)

	fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
	fit
	summary(fit)

	plot(iris$Sepal.Length, iris$Sepal.Width)
	abline(fit, col = 'red', lwd = 5)

	## TODO improve the model
	summary(lm(Sepal.Width ~ Sepal.Length + Species,
	data = iris))

	library(ggplot2)
	ggplot(iris, aes(Sepal.Length, Sepal.Width,
	col = Species)) +
	geom_point() + geom_smooth(method = 'lm')

	ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
	geom_point(aes(col = Species)) +
	geom_smooth(aes(col = Species), method = 'lm') +
	geom_smooth(method = 'lm', col = 'black') + theme_bw()

	## #############################################################################
	## PCA demo to show the steps in hierarchical clustering
	## #############################################################################

	PC <- prcomp(iris[, 1:4])$x
	dm <- dist(iris[, 1:4])
	hc <- hclust(dm)
	plot(hc)

	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i, border = 'red')
	Sys.sleep(1)
	}

	library(animation)
	ani.options(interval = 1)
	saveGIF({
	for (i in 2:8) {
	plot(hc)
	rect.hclust(hc, k = i, border = 'red')
	}
	})


	library(animation)
	ani.options(interval = 1)
	saveGIF({
	for (i in 1:8) {
	plot(-PC[, 1:2],
	col = cutree(hc, i),
	pch = as.numeric(factor(iris$Species)) + 5)
	}
	})

	## #############################################################################
	## basic decision trees
	## #############################################################################

	rm(iris)
	set.seed(3)
	iris$rnd <- runif(150)
	iris <- iris[order(iris$rnd),]
	iris
	iris$rnd <- NULL

	train <- iris[1:100, ]
	test <- iris[101:150, ]

	library(rpart)
	ct <- rpart(Species ~ ., data = train)
	summary(ct)

	plot(ct); text(ct)

	library(partykit)
	plot(as.party(ct))

	predict(ct, newdata = test)
	predict(ct, newdata = test, type = 'class')

	## confusion matrix
	table(test$Species,
	predict(ct, newdata = test,
	type = 'class'))

	ct <- rpart(Species ~ ., data = train,
	minsplit = 3)
	## ?rpart.control
	plot(ct); text(ct)

	table(test$Species,
	predict(ct, newdata = test,
	type = 'class'))