Skip to content

Instantly share code, notes, and snippets.

@daroczig
Created November 3, 2018 23:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daroczig/445ec3d6df1e35c55e017bce5c394fc2 to your computer and use it in GitHub Desktop.
Save daroczig/445ec3d6df1e35c55e017bce5c394fc2 to your computer and use it in GitHub Desktop.
Code presented at the R workshop of the Crunch 2018 conference: http://crunchconf.com
## #############################################################################
## intro slides: http://bit.ly/CRUNCH-R-2018
## #############################################################################
## intro to R
## basic operations
1 + 3
3 * 2
## constants
pi
"pi"
'pi'
## getting help
?pi
month.name
month.abb
apropos('month')
## vectors
letters
LETTERS
str(letters)
## custom vectors
letters[1]
letters[5]
letters[1:5]
1:5
seq(1, 5)
?seq
seq(1, 5, 0.1)
seq(to = 5, by = 0.1, from = 1)
## variables
x = 4 # works, but not suggested
x
x <- 4 # works, and recommened
x * 2
x ^ 2
## TODO compute the square root of x
sqrt(x)
x ^ 0.5
## functions
sin(pi / 2)
round(pi)
?round
round(pi, digits = 2)
ceiling(pi)
floor(pi)
runif(1)
?runif
runif(5)
## custom functions
f <- function(x) 2 * x + 1
f <- function(x) {
2 * x + 1
}
f(5)
f(pi)
## vectorized operations
f(1:5)
x <- seq(1, 5, 0.1)
x
f(x)
## visualize function
plot(x, f(x))
?plot
plot(x, f(x), type = 'l')
plot(x, f(x), type = 'b')
## TODO draw 1 period of sine curve / wave
?sin
f <- function(x) sin(x)
x <- seq(0, pi * 2, by = 0.1)
plot(x, sin(x), type = 'l', xlim = c(2, 4))
## a simpler way for plotting functions
?curve
curve(f, from = 0, to = pi * 2)
grid()
## TODO Brownian motion / random walk in 1D
set.seed(42)
x <- 0
for (i in 1:500) {
x <- x + sign(runif(1) - 0.5)
}
x
## Rcpp
round(runif(1)) - 0.5
?sign
sign(runif(10) - 0.5)
sign(runif(1, min = -0.5, max = 0.5))
cumsum(sign(runif(25, min = -0.5, max = 0.5)))
x <- sign(runif(25, min = -0.5, max = 0.5))
x
cumsum(x)
cumsum(x)[25]
cumsum(x)[length(x)]
tail(cumsum(x), 1)
sum(x)
plot(1:25, cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's')
for (i in 1:5) {
lines(cumsum(sign(runif(25, min = -0.5, max = 0.5))), type = 's', col = i)
}
## pro tipp #1
plot(c(1, 25), c(-25, 25), type = 'n')
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))
## pro tipp #2
sample(c(-1, 1), 25, replace = TRUE)
## custom vectors -> combine values into vector
h <- c(174, 170, 160)
w <- c(90, 80, 70)
plot(h, w)
plot(h, w, main = 'Heights and weights', xlab = 'Height', ylab = 'Weight')
## some basic modeling
cor(h, w)
?lm
lm(w ~ h) # formula notation => predicting w based on h
172 * 1.346 - 146.154
fit <- lm(w ~ h)
lm(h ~ w)
fit
predict(fit, newdata = list(h = 172))
str(fit)
fit
summary(fit)
predict(fit, newdata = list(h = 172))
predict(fit, newdata = list(h = c(56, 104)))
## we need more data
## we need more complex models
## ...
## extrapolating!
## TODO visualize the lm
plot(h, w, xlim = c(0, 250), ylim = c(0, 150))
fit
abline(fit, col = 'red')
## example for polynomial model
fit <- lm(w ~ poly(h, 2, raw = TRUE))
fit <- lm(w ~ h + I(h^2))
fit
## #############################################################################
## intro to data.frame
df <- data.frame(weight = w, height = h)
df
str(df)
df[1, 2]
df[2, 2]
df[1, ]
df[, 2]
str(df)
df$weight
df[, 1]
df$weight[2]
plot(df)
cor(df)
## TODO let's create a 3rd column: BMI
## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters
df
df$bmi <- df$weight / (df$height / 100) ^ 2
df
## get more data
?read.csv
df <- read.csv('http://bit.ly/heightweight')
str(df)
cor(df[, 2:5])
cor(df[, -1])
names(df)
plot(df)
## drop (redundant) column
df$ageMonth <- NULL
str(df)
## more EDA
install.packages('GGally')
library(GGally)
pairs(df)
ggpairs(df)
library(pairsD3)
pairsD3(df)
summary(df)
min(df$ageYear)
max(df$ageYear)
mean(df$ageYear)
median(df$ageYear)
sd(df$ageYear)
length(df$ageYear)
nrow(df)
ncol(df)
dim(df)
c(nrow(df), ncol(df))
range(df$ageYear)
diff(range(df$ageYear))
## TODO compute BMI (kg, cm)
df$height <- df$heightIn * 2.54
df$weight <- df$weightLb * 0.45
df$bmi <- df$weight / (df$height / 100) ^ 2
## TODO visualize the lm predicting weight based on height
lm(df$weight ~ df$height)
fit <- lm(weight ~ height, data = df)
plot(df$height, df$weight)
abline(fit, col = 'red')
## TODO check if there's a diff in the height of m/f
t.test(height ~ sex, data = df)
t.test(weight ~ sex, data = df)
?t.test
?aov
?TukeyHSD
aov(height ~ sex, data = df)
summary(aov(height ~ sex, data = df))
summary(aov(weight ~ sex, data = df))
TukeyHSD(aov(height ~ sex, data = df))
TukeyHSD(aov(weight ~ sex, data = df))
## #############################################################################
## ggplot2: The R implementation of the Grammar of Graphics
library(ggplot2)
?diamonds
str(diamonds)
## barplot
ggplot(diamonds, aes(x = cut)) + geom_bar()
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
ggplot(diamonds, aes(x = cut)) +
geom_bar(colour = 'orange', fill = 'white') +
theme_bw()
p <- ggplot(diamonds, aes(x = cut))
p
p + geom_bar()
p <- p + geom_bar()
p
p + theme_bw()
## coord transformations
p
p + scale_y_log10()
p + scale_y_sqrt()
p + scale_y_reverse()
p + coord_flip()
## facets
p
p + facet_wrap( ~ clarity)
str(diamonds)
p + facet_grid(color ~ clarity, scales = 'free')
## formula: lhs ~ rhs1 + rhs2
## histogram
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 100)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10000)
## back to barplots: stacked and clustered bar charts
ggplot(diamonds, aes(x = cut)) + geom_bar(fill = 'white')
ggplot(diamonds, aes(x = cut)) + geom_bar(aes(fill = clarity))
ggplot(diamonds, aes(x = cut, fill = clarity)) + geom_bar()
p <- ggplot(diamonds, aes(x = cut, fill = clarity))
p + geom_bar()
p + geom_bar(position = 'fill')
p + geom_bar(position = 'dodge')
## density function
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price, fill = cut)) +
geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(price, fill = cut)) + geom_density()
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.25)
## scatterplot
ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
ggplot(diamonds, aes(x = carat, y = price)) + geom_point(alpha = 0.1)
## right geom?
install.packages('hexbin')
ggplot(diamonds, aes(x = carat, y = price)) + geom_hex()
ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() +
geom_smooth(method = 'lm') +
geom_smooth(color = 'red')
## themes
library(ggthemes)
p <- ggplot(diamonds, aes(carat, price, color = cut)) + geom_point()
p
p + theme_economist() + scale_colour_economist()
p + theme_stata() + scale_colour_stata()
p + theme_excel() + scale_colour_excel()
p + theme_wsj() + scale_colour_wsj("colors6", "")
p + theme_gdocs() + scale_colour_gdocs()
p
p + theme_bw()
theme_bw
p + theme(axis.text.y = element_text(angle = 90, hjust = 0.5))
## create a custom theme for future usage
theme_custom <- function() {
theme(
axis.text = element_text(
family = 'Times New Roman',
color = "orange",
size = 12,
face = "italic"),
axis.title = element_text(
family = 'Times New Roman',
color = "orange",
size = 16,
face = "bold"),
axis.text.y = element_text(angle = 90, hjust = 0.5),
panel.background = element_rect(
fill = "orange",
color = "white",
size = 2)
)
}
p + theme_custom()
?mtcars
## TODO plot the distribution of horsepower
ggplot(mtcars, aes(x = hp)) + geom_histogram()
## TODO plot the distribution of the number carburators
ggplot(mtcars, aes(x = carb)) + geom_bar()
ggplot(mtcars, aes(x = factor(carb))) + geom_bar()
mtcars$carb <- factor(mtcars$carb)
## TODO barplot of carburators per am
ggplot(mtcars, aes(carb)) + geom_bar() + facet_wrap(~am)
ggplot(mtcars, aes(carb, fill = factor(am))) + geom_bar()
## TODO boxplot of hp by carb
ggplot(mtcars, aes(x = factor(carb), y = hp)) + geom_boxplot()
table(mtcars$carb)
## TODO plot hp and wt by carb
ggplot(mtcars, aes(hp, wt)) + geom_point() + facet_wrap(~carb)
## TODO plot regression (lm) on wt predicting hp
ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm')
?geom_smooth
ggplot(mtcars, aes(wt, hp)) + geom_point() + geom_smooth(method = 'lm', se = FALSE)
## #############################################################################
## feature selection example
?iris
str(iris)
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
geom_smooth(method = 'lm')
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
geom_smooth()
## Simpon's paradox
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point(aes(color = Species)) +
geom_smooth(aes(color = Species), method = 'lm') +
geom_smooth(method = 'lm', color = 'black') +
theme(legend.position = 'top')
summary(lm(Sepal.Width ~ Sepal.Length, data = iris))
summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))
str(iris)
## #############################################################################
## hierachical clustering
?hclust
dm <- dist(iris[, 1:4])
str(dm)
hc <- hclust(dm)
plot(hc)
rect.hclust(hc, k = 3)
clusters <- cutree(hc, 3)
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species)
plot(iris$Sepal.Length, iris$Sepal.Width, col = iris$Species,
pch = clusters)
table(iris$Species)
table(clusters)
table(iris$Species, clusters)
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i)
Sys.sleep(1)
}
library(animation)
ani.options(interval = 1)
saveGIF({
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i)
}
})
saveGIF({
for (i in 2:8) {
clusters <- cutree(hc, i)
plot(iris$Sepal.Length, iris$Sepal.Width, col = clusters)
}
})
## #############################################################################
## supervised learning: a basic CART decision tree
str(iris)
install.packages('rpart')
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width +
Petal.Length + Petal.Width, data = iris)
plot(fit)
text(fit)
library(partykit)
plot(as.party(fit))
fit
predict(fit)
predict(fit, type = 'class')
table(iris$Species, predict(fit, type = 'class'))
## #############################################################################
## Principal Component Analysis (PCA)
download.file('http://bit.ly/nasa-image-pca', 'image.jpg', mode = 'wb')
?download.file
library(jpeg)
img <- readJPEG('image.jpg')
str(img) ## RGB 0-1
dim(img)
h <- dim(img)[1]
w <- dim(img)[2]
h
w
img2d <- matrix(img, h * w)
str(img2d)
pca <- prcomp(img2d)
summary(pca)
str(pca)
image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))
## #############################################################################
## Multi-Dimensional Scaling (MDS)
download.file('http://bit.ly/hun-cities-distance',
'cities.xls')
library(readxl)
cities <- read_excel('cities.xls')
str(cities)
## cities$`Települések távolsága közúton (km)` <- NULL
cities[, 2:41]
str(cities[, -1])
cities <- cities[, -1]
str(cities)
cities[41, ]
cities <- cities[1:40, ]
str(cities)
mds <- cmdscale(as.dist(cities))
mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))
## use mds with non-geo stuff
str(mtcars)
mtcars
mds <- cmdscale(dist(mtcars))
mds
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))
library(ggplot2)
str(mds)
mds <- as.data.frame(mds)
str(mds)
mds$car <- rownames(mtcars)
str(mds)
ggplot(mds, aes(V1, V2, label = car)) + geom_text()
library(ggrepel)
ggplot(mds, aes(V1, V2, label = car)) + geom_text_repel()
## #############################################################################
## H2O example for random forest and gbm
## start and connect to H2O
library(h2o)
h2o.init()
## in a browser check http://localhost:54321
## demo data
library(hflights)
rm(hflights)
str(hflights)
## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')
str(hflights.hex)
str(head(hflights.hex))
head(hflights.hex, 3)
summary(hflights.hex)
## check flight number: numeric?
## check on H2O web interface as well (enum vs factor)
## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)
## boring
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
hflights.hex[, v] <- as.factor(hflights.hex[, v])
}
summary(hflights.hex)
## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]
hflights[, .N, by = hour]
hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
hflights[, .N, by = hour][order(hour)]
hflights[hour == '']
hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
hflights[, .N, by = hour]
hflights[is.na(hour)]
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]
## drop columns
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
UniqueCarrier, FlightNum, TailNum, Distance,
Cancelled = factor(Cancelled))]
## re-upload to H2O
h2o.ls()
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')
## split the data
h2o.splitFrame(data = hflights.hex , ratios = 0.75,
destination_frames = paste0('hflights', 0:1))
h2o.ls()
## build the first model
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf',
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1')
hflights.rf
## more trees
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf500',
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1', ntrees = 500)
## change to UI and check ROC curve & AUC
## GBM
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm')
## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm2', ntrees = 550)
## but no: although higher training AUC, lower validation AUC => overfit
## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights_part0',
validation_frame = 'hflights_part1',
model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01)
## bye
h2o.shutdown()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment