Skip to content

Instantly share code, notes, and snippets.

@daroczig
Created October 19, 2017 12:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save daroczig/67479016434089dfbe3d57ce341a853b to your computer and use it in GitHub Desktop.
Code presented at the R workshop of the CRUNCH 2017 conference: http://crunchconf.com
## intro slides: http://bit.ly/CRUNCH-R-2017
## basic operations -- R as an interactive calculator
1 + 3
3*2
3^2  # exponentiation
## constants -- a few values/vectors shipped with base R
pi          # the built-in numeric constant 3.141593...
"pi"        # a character string, not the constant
'pi'        # single quotes are equivalent to double quotes in R
letters     # built-in vector of lowercase letters a-z
LETTERS     # uppercase A-Z
month.name  # full English month names
month.abb   # abbreviated month names
## help -- looking up documentation and searching by name
?pi               # open the help page of a topic
apropos('month')  # list objects whose name matches the pattern
apropos('cor')
## variables -- assignment
x = 4   # works, but `<-` is the conventional assignment operator
x <- 4  # idiomatic assignment
x       # typing a name prints its value
x / 2
x^2
x^0.5   # square root via exponentiation
## functions
sqrt(x)      # same as x^0.5
sqrt(x = x)  # arguments can be passed by name
sin(pi / 2)
round(pi)    # round to the nearest integer
ceiling(pi)  # round up
floor(pi)    # round down
## custom functions -- define f(x) = 2x + 1
f <- function(x) 2 * x + 1
f(5)
f(pi)
## vectors
c(1, 2, 3, 4, 5)  # combine values into a vector
1:5               # integer sequence shorthand
seq(1, 5)         # same via seq()
?seq
seq(1, 5, 0.1)    # third argument sets the step size
x <- seq(1, 5, 0.1)
plot(x, f(x))              # scatterplot of f evaluated on the grid
plot(x, f(x), type = 'l')  # draw as a line instead of points
plot(x, f(x), type = 'l', xlab = '', main = '2x+1')  # add a title, drop x label
grid()  # overlay a background grid on the last plot
## TODO plot 1 period of sine curve
?sin
f <- function(x) sin(x)   # redefine f as the sine function
x <- seq(0, pi * 2, 0.1)  # grid covering one full period
plot(x, sin(x), type = 'l')
## a simpler way for plotting functions
curve(f)  # curve() evaluates a function/expression over a range and plots it
?curve
curve(2*x + 1, from = 0, to = 50)
curve(x + 1, add = TRUE, col = 'red')  # add a second curve to the same plot
## TODO plot 1 period of sine curve
curve(sin, from = 0, to = 2*pi)
## TODO Brownian motion in 1D -- a random-walk demo
runif(1)  # one draw from the uniform distribution on [0, 1]
runif(5)
## random numbers from uniform distribution
round(runif(5))        # rounding yields 0/1 -> fair coin flips
round(runif(5))*2 - 1  # rescale flips to -1/+1 steps
cumsum(round(runif(25))*2 - 1)  # cumulative sum = position of the walk over time
plot(cumsum(round(runif(25))*2 - 1), type = 's')  # step-style plot
set.seed(42)  # fix the RNG seed so the plot is reproducible
plot(cumsum(round(runif(25))*2 - 1), type = 's')
for (i in 2:6) {
lines(cumsum(round(runif(25))*2 - 1), type = 's', col = i)  # overlay more walks
}
## pro tipp #1: open an empty canvas with fixed limits first so lines fit
plot(c(1, 25), c(-25, 25), type = 'n')
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))
## pro tipp #2: sample the -1/+1 steps directly instead of rounding runif
sample(c(-1, 1), 25, replace = TRUE)
################################################################################
## custom vectors -> combine values into vector
h <- c(174, 170, 160)  # heights in cm
h
(w <- c(90, 80, 70))   # weights in kg; wrapping in () prints the assigned value
plot(h, w)
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
cor(w, h)  # Pearson correlation
lm(w ~ h) # formula notation: model weight as a linear function of height
fit <- lm(w ~ h)
summary(fit)  # coefficients, R^2, significance
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
abline(fit, col = 'red')  # overlay the fitted regression line
## some basic modeling
predict(fit, list(h = c(56, 104)))  # predictions for new heights
56 * fit$coefficients[2] + fit$coefficients[1]  # same prediction by hand
56 * 1.3462 + -146.1538  # ... with the rounded coefficients
## why? the intercept lies far outside the observed data range
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight',
xlim = c(0, max(h)),
ylim = c(0, max(w)))
abline(fit, col = 'red')
## polynomial model -- two equivalent ways to add a quadratic term
fit <- lm(w ~ poly(h, 2, raw = TRUE))
fit <- lm(w ~ h + I(h^2))  # I() protects ^ from formula interpretation
fit
2824 + 174 * -34.3571 + (174^2)*0.1071  # manual prediction at h = 174
plot(h, w)
lines(h, predict(fit), col = 'red')  # fitted quadratic over the observed points
predict(fit, data.frame(h = c(56, 104)))
plot(0:200, predict(fit, list(h = 0:200)), type = 'l')  # extrapolated curve
abline(v = c(160, 174), col = 'red')  # mark the observed data range
## data.frame -- the standard tabular data structure
df <- data.frame(weight = w, height = h)
df
str(df)    # structure: column names, types, first values
df$weight  # extract one column as a vector
?'$'
plot(df)   # scatterplot matrix of all columns
cor(df)    # correlation matrix
lm(weight ~ height, data = df)
str(df)
## df[i, j]  -- general indexing notation: i selects rows, j selects columns
## (FIX: this line was bare code in the original and errored, as `j` is undefined)
df[1, ]      # first row
df[, 1]      # first column
df[3, 2]     # a single cell
df$weight
df$weight[2] # second element of the weight column
## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters.
df$height <- df$height/100  # convert cm to m
df
df$bmi <- df$weight / df$height^2
df
## NOTE(review): height is already in meters at this point, so the extra /100
## here double-converts; kept from the original demo -- verify before reuse
df$bmi <- df$weight / (df$height/100)^2
## import data -- read a CSV directly over HTTP
df <- read.csv('http://bit.ly/heightweight')
str(df)
## TODO convert to metric units
df$height <- df$heightIn * 2.54  # inches -> centimeters
df$weight <- df$weightLb * 0.45  # pounds -> kilograms (approximate factor)
str(df)
df$heightIn <- df$weightLb <- NULL  # drop the original imperial columns
str(df)
## descriptive stats
min(df$ageYear)
max(df$ageYear)
range(df$ageYear)  # min and max at once
sum(df$weight)     # FIX: weightLb was deleted above; the original sum(df$weightLb) errored
length(df$weight)  # number of observations
nrow(df)
ncol(df)
dim(df)            # rows and columns at once
## TODO plot and analyze association between height and weight
plot(df$height, df$weight)
abline(lm(weight ~ height, data = df), col = 'red')  # overlay the regression line
## TODO compute and plot BMI index
df$bmi <- df$weight / (df$height/100)^2  # height is in cm here, so convert to m
hist(df$bmi)
## exploratory data analysis
pairs(df)  # base R scatterplot matrix
library(GGally)
ggpairs(df)  # ggplot2-based matrix with correlations and densities
library(pairsD3)
pairsD3(df)  # interactive D3-based version
## statistical test: t-test, ANOVA
t.test(height ~ sex, data = df)        # mean height difference between sexes
aov(height ~ sex, data = df)           # same comparison as a one-way ANOVA
summary(aov(height ~ sex, data = df))  # F statistic and p value
summary(aov(weight ~ sex, data = df))
## Post hoc tests => Tukey Honest Significant Differences
TukeyHSD(aov(height ~ sex, data = df))  # pairwise group differences with CIs
TukeyHSD(aov(weight ~ sex, data = df))
## #############################################################################
## Wilkinson: The Grammar of Graphics
## Wickham: ggplot2 -> layer by layer:
## - coordinate system,
## - aesthetics (labels)
## - scales
## - geometries
## - shapes
## - statistics
## - grid, color theme etc. + layers
## http://www.cookbook-r.com/Graphs/
## #############################################################################
library(ggplot2)
?diamonds  # built-in dataset of ~54k diamonds
## barplot
ggplot(diamonds, aes(x = cut)) + geom_bar()  # counts per cut category
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()  # black-and-white theme
ggplot(diamonds, aes(x = cut)) +
geom_bar(colour = "darkgreen", fill = "white") +
theme_bw()
p <- ggplot(diamonds, aes(x = cut)) +
geom_bar(colour = "darkgreen", fill = "white") + theme_bw()
p  # a stored ggplot object renders when printed
## coord transformations
library(scales)
p + scale_y_log10()    # log-scale the y axis
p + scale_y_sqrt()
p + scale_y_reverse()
p + coord_flip()       # horizontal bars
## facet: split one plot into small multiples
p + facet_wrap( ~ clarity)
p + facet_grid(color ~ clarity)  # 2D grid of panels
## stacked, clustered bar charts
p <- ggplot(diamonds, aes(x = color, fill = cut))
p + geom_bar()                    # stacked counts
p + geom_bar(position = "fill")   # stacked and normalized to 100%
p + geom_bar(position = "dodge")  # side-by-side bars
## histogram of price
## FIX: use geom_histogram -- passing binwidth to geom_bar was deprecated and
## then removed in modern ggplot2; geom_histogram is the correct geom here
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 100)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10000)  # too coarse
## overlaid density estimates per cut (alpha for transparency)
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw()
## scatterplot
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point()
## adding layers
p
p <- p + geom_smooth()  # add a smoothed trend line
p
p + geom_smooth(method = "lm", se = FALSE, color = 'red')  # linear fit, no CI band
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth()
## themes
library(ggthemes)  # extra ready-made themes imitating well-known styles
p + theme_economist() + scale_colour_economist()
p + theme_stata() + scale_colour_stata()
p + theme_excel() + scale_colour_excel()
p + theme_wsj() + scale_colour_wsj("colors6", "")
p + theme_gdocs() + scale_colour_gdocs()
## Further reading at http://www.cookbook-r.com/Graphs/
## TODO on mtcars -- exercises with worked solutions
## * number of carburators
ggplot(mtcars, aes(x = carb)) + geom_bar()
## * hp
ggplot(mtcars, aes(x = hp)) + geom_histogram()
## * barplot of carburators per am
ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am)
## stacked bar chart
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar()
## grouped/dodged
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge')
## * boxplot of hp by carb
## NOTE(review): the exercise says "by carb" but the code groups by am -- verify
ggplot(mtcars, aes(x = factor(am), y = hp)) + geom_boxplot()
## * hp and wt by carb
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb)
## * hp and wt by carb regression
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth()
## #############################################################################
## PCA demo on image processing
## #############################################################################
## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg
## http://bit.ly/BudapestBI-R-img
download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg')
library(jpeg)
img <- readJPEG('image.jpg')  # 3D array: height x width x RGB channels
str(img)
dim(img)
h <- dim(img)[1]  # image height in pixels
w <- dim(img)[2]  # image width in pixels
h;w
img1d <- matrix(img, h * w)  # flatten to (h*w) pixel rows x 3 color columns
str(img1d)
pca <- prcomp(img1d)  # PCA on the RGB channels
pca
summary(pca)  # the first component captures most of the color variance
image(matrix(pca$x[, 1], h))  # render the first principal component as an image
image(matrix(pca$x[, 2], h))
image(matrix(pca$x[, 2], h), col = gray.colors(100))  # grayscale palette
image(matrix(pca$x[, 3], h))
## #############################################################################
## MDS (multidimensional scaling)
## #############################################################################
## distance between 40 Hungarian cities -> 2D scatterplot
download.file('http://bit.ly/CEU-R-distances', 'cities.xls')
library(readxl)
cities <- read_excel('cities.xls')
str(cities)
## get rid of 1st column and last row (metadata)
cities <- cities[, -1]
cities <- cities[-nrow(cities), ]
mds <- cmdscale(as.dist(cities))  # classical MDS: distance matrix -> 2D coordinates
mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))  # label points with the city names
## flipping both x and y axis
## (MDS coordinates are only defined up to rotation/reflection)
mds <- -mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))
## distance between European cities -> 2D scatterplot on ggplot
mds <- cmdscale(eurodist)  # eurodist ships with base R
mds
mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mds))) + geom_text()
## non-geo stuff: MDS works on any distance matrix
str(mtcars)
?mtcars
mds <- cmdscale(dist(mtcars))  # "distance" between cars in feature space
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))
mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
geom_text() + theme_bw()
library(ggrepel)  # repels labels so they do not overlap
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
geom_text_repel() + theme_bw()
## #############################################################################
## feature selection - same in 2D
## #############################################################################
str(iris)
plot(iris$Sepal.Length, iris$Sepal.Width)
fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit
summary(fit)  # negative slope when species is ignored
plot(iris$Sepal.Length, iris$Sepal.Width)
abline(fit, col = 'red', lwd = 5)
## TODO improve the model
## adding Species flips the slope -- a Simpson's paradox style example
summary(lm(Sepal.Width ~ Sepal.Length + Species,
data = iris))
library(ggplot2)
ggplot(iris, aes(Sepal.Length, Sepal.Width,
col = Species)) +
geom_point() + geom_smooth(method = 'lm')  # per-species regression lines
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
geom_point(aes(col = Species)) +
geom_smooth(aes(col = Species), method = 'lm') +
geom_smooth(method = 'lm', col = 'black') + theme_bw()  # plus the pooled fit
## #############################################################################
## PCA demo to show the steps in hierarchical clustering
## #############################################################################
PC <- prcomp(iris[, 1:4])$x  # project the 4 numeric columns to principal components
dm <- dist(iris[, 1:4])      # pairwise Euclidean distances
hc <- hclust(dm)             # hierarchical clustering (complete linkage by default)
plot(hc)                     # dendrogram
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i, border = 'red')  # highlight the k-cluster cut
Sys.sleep(1)                            # pause so each step is visible
}
library(animation)
ani.options(interval = 1)  # 1 second between animation frames
saveGIF({                  # record the same loop into an animated GIF
for (i in 2:8) {
plot(hc)
rect.hclust(hc, k = i, border = 'red')
}
})
library(animation)
ani.options(interval = 1)
saveGIF({
for (i in 1:8) {
plot(-PC[, 1:2],           # first two PCs, axes flipped for orientation
col = cutree(hc, i),       # color by cluster membership at k = i
pch = as.numeric(factor(iris$Species)) + 5)  # point shape by true species
}
})
## #############################################################################
## basic decision trees
## #############################################################################
rm(iris)     # drop the modified copy; the next access reloads the pristine dataset
set.seed(3)  # reproducible shuffle
iris$rnd <- runif(150)          # random key ...
iris <- iris[order(iris$rnd),]  # ... used to shuffle the rows
iris
iris$rnd <- NULL                # remove the helper column
train <- iris[1:100, ]          # first 100 shuffled rows for training
test <- iris[101:150, ]         # remaining 50 held out for testing
library(rpart)
ct <- rpart(Species ~ ., data = train)  # classification tree on all features
summary(ct)
plot(ct); text(ct)  # base-graphics rendering of the tree
library(partykit)
plot(as.party(ct))  # nicer tree visualization
predict(ct, newdata = test)                  # class probabilities
predict(ct, newdata = test, type = 'class')  # predicted labels
## confusion matrix
table(test$Species,
predict(ct, newdata = test,
type = 'class'))
ct <- rpart(Species ~ ., data = train,
minsplit = 3)  # allow splitting tiny nodes -> deeper, more complex tree
## ?rpart.control
plot(ct); text(ct)
table(test$Species,
predict(ct, newdata = test,
type = 'class'))
## #############################################################################
## K-Nearest Neighbors algorithm
## #############################################################################
## FIX: setDT() needs data.table, which the original script only loaded later
library(data.table)
setDT(iris)                # convert the local iris copy to data.table by reference
iris[, rnd := runif(150)]  # add a random key column ...
setorder(iris, rnd)        # ... and sort by it to shuffle the rows
iris
iris[, rnd := NULL]        # drop the helper column
train <- iris[1:100]       # first 100 shuffled rows for training
test <- iris[101:150]      # remaining 50 for testing
library(class)
knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species)
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species)
table(test$Species, res)   # confusion matrix with default k = 1
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species, k = 2)
table(test$Species, res)   # same with k = 2 neighbors
## #############################################################################
## EXERCISE: basic decision trees
## #############################################################################
## => model gender from the height/weight dataset
df <- read.csv('http://bit.ly/BudapestBI-R-csv')
library(rpart)
ct <- rpart(sex ~ heightIn + weightLb, data = df)
plot(ct); text(ct)
ct <- rpart(sex ~ heightIn + weightLb, data = df, minsplit = 1)  # fully grown tree
table(df$sex, predict(ct, type = 'class'))  # confusion matrix on the TRAINING data
## #############################################################################
## did you use test data? beware of overfitting
## #############################################################################
## random order
set.seed(7)
df <- df[order(runif(nrow(df))), ]  # shuffle the rows
## create training and test datasets
train <- df[1:100, ]
test <- df[101:237, ]
## pretty impressing results!
ct <- rpart(sex ~ heightIn + weightLb, data = train, minsplit = 1)
table(train$sex, predict(ct, type = 'class'))  # near-perfect on training data
## but oops
table(test$sex, predict(ct, newdata = test, type = 'class'))  # much worse held out
## with a more generalized model
ct <- rpart(sex ~ heightIn + weightLb, data = train)  # default pruning controls
table(train$sex, predict(ct, type = 'class'))
table(test$sex, predict(ct, newdata = test, type = 'class'))  # generalizes better
## back to slides on high level overview on bagging, random forest, gbm etc
## #############################################################################
## H2O: scalable machine-learning platform with an R client
## #############################################################################
## start and connect to H2O (launches a local Java-based instance)
library(h2o)
h2o.init()
## in a browser check http://localhost:54321
## demo data
library(hflights)
rm(hflights)  # NOTE(review): presumably removes a local copy so the package data is used -- verify
str(hflights)
## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')  # upload and name the frame 'hflights'
str(hflights.hex)
str(head(hflights.hex))  # head() pulls a few rows back into R
head(hflights.hex, 3)
summary(hflights.hex)
## check flight number: numeric?
## check on H2O web interface as well (enum vs factor)
## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)
## boring: convert the remaining categorical-as-numeric columns in a loop
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
hflights.hex[, v] <- as.factor(hflights.hex[, v])
}
summary(hflights.hex)
## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]  # naive: take the first two characters
hflights[, .N, by = hour]  # reveals parsing problems (times have 3 or 4 digits)
hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]  # drop the minute digits instead
hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]    # bin into 4-hour buckets
hflights[, .N, by = hour]
hflights[is.na(hour)]  # which flights have no departure time?
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]  # missing hour mostly = cancelled
## drop columns: keep only the candidate predictors plus the outcome
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
UniqueCarrier, FlightNum, TailNum, Distance,
Cancelled = factor(Cancelled))]
## re-upload to H2O
h2o.ls()  # list the frames currently held by H2O
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')
## split the data into 75% train / 25% validation frames
h2o.splitFrame(data = hflights.hex , ratios = 0.75, destination_frames = paste0('hflights', 0:1))
h2o.ls()
## build the first model
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf',
x = setdiff(names(hflights), 'Cancelled'),  # every column except the outcome
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1')
hflights.rf
## more trees
hflights.rf <- h2o.randomForest(
model_id = 'hflights_rf500',
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1', ntrees = 500)
## change to UI and check ROC curve & AUC
## GBM: gradient boosting machine on the same split
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm')
## more trees should help, again !!!
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
training_frame = 'hflights0',
validation_frame = 'hflights1',
model_id = 'hflights_gbm2', ntrees = 550)
## but no: although higher training AUC, lower validation AUC => overfit
## more trees CAN help when paired with a lower learning rate (more, smaller steps)
hflights.gbm <- h2o.gbm(
x = setdiff(names(hflights), 'Cancelled'),
y = 'Cancelled',
## FIX: the split frames created above are named 'hflights0'/'hflights1';
## the original referenced nonexistent 'hflights_part0'/'hflights_part1'
training_frame = 'hflights0',
validation_frame = 'hflights1',
## FIX: use a fresh model_id; reusing 'hflights_gbm2' overwrites the previous model
model_id = 'hflights_gbm3', ntrees = 250, learn_rate = 0.01)
## bye: shut down the local H2O instance
h2o.shutdown()
## #############################################################################
## intro to data.table
## #############################################################################
## introduction to data.table
install.packages('data.table')
install.packages('hflights')
library(hflights)
?hflights  # all flights departing Houston airports in 2011
str(hflights)
library(data.table)
dt <- data.table(hflights)
## dt[i]: filter rows
dt[DayOfWeek == 5]
dt[DayOfWeek == 5 & Month == 3]
dt[DayOfWeek == 5 & Month == 3, ActualElapsedTime]  # also select a column
## dt[i, j]: evaluate an expression on the filtered rows
dt[DayOfWeek == 5 & Month == 3, mean(ActualElapsedTime)]  # NA: missing values present
dt[DayOfWeek == 5 & Month == 3,
mean(ActualElapsedTime, na.rm = TRUE)]
dt[is.na(ActualElapsedTime) & Cancelled == 0 & Diverted == 0]  # where do the NAs come from?
## the avg flight time to LAX
dt[Dest == 'LAX', mean(AirTime, na.rm = TRUE)]
## avg flight time per dest
## dt[i, j, by = ...]: grouped aggregation
dta <- dt[, mean(AirTime, na.rm = TRUE), by = Dest]
dta
names(dta)[2] <- 'avgAirTime'  # rename the auto-generated V1 column
## multiple "j" expressions returned as a named list
dt[, .(
minAvgTime = min(AirTime, na.rm = TRUE),
avgAirTime = mean(AirTime, na.rm = TRUE),
maxAirTime = max(AirTime, na.rm = TRUE)),
by = .(Dest, Origin)]
## TODO number of cancelled flights to LAX -- several equivalent solutions
dt[Dest == 'LAX', sum(Cancelled)]
nrow(dt[Dest == 'LAX' & Cancelled == 1])
dt[Dest == 'LAX' & Cancelled == 1, .N]  # .N = number of rows in the (sub)group
dt[Cancelled == 1, .N, by = Dest]
## TODO sum of miles travelled to LAX
dt[Dest == 'LAX', sum(Distance)]
dt$distanceKm <- dt$Distance * 1.6  # data.frame-style new column (copies)
dt[, distanceKm := Distance * 1.6]  # data.table-style := assigns by reference
dt[Dest == 'LAX', sum(Distance) * 1.6]
dt[Dest == 'LAX', sum(distanceKm)]
## TODO percentage of flights cancelled per dest
dt[, sum(Cancelled == 1) / .N, by = Dest]
dt[, sum(Cancelled) / .N, by = Dest]  # same, as Cancelled is 0/1
dt[, mean(Cancelled) * 100]           # overall cancellation percentage
## TODO plot the departure and arrival delays
ggplot(dt, aes(ArrDelay, DepDelay)) + geom_point()  # heavily overplotted (~250k rows)
ggplot(dt, aes(ArrDelay, DepDelay)) + geom_hex()    # hexagonal binning scales better
## TODO plot the avg departure and arrival delay by dest:
## aggregate, then visualize
dta <- dt[, list(
arrival = mean(ArrDelay, na.rm = TRUE),
departure = mean(DepDelay, na.rm = TRUE)
), by = Dest]
ggplot(dta, aes(x = departure, y = arrival)) +
geom_text(aes(label = Dest))  # label each point with the destination airport code
## TODO visualize the avg flight time per dest
dta <- dt[, list(airtime = mean(AirTime, na.rm = TRUE)),
by = Dest]
ggplot(dta, aes(x = Dest, y = airtime)) +
geom_bar(stat = 'identity')  # bar heights taken from the data, not from counts
## heatmap => day of week + hour => N
install.packages("nycflights13")
library(nycflights13)
str(flights)
dt <- data.table(flights)
dt[, dayOfWeek := weekdays(as.Date(
paste(year, month, day, sep = '-')
))]  # one-liner version ...
dt[, date := paste(year, month, day, sep = '-')]  # ... or the same step by step
dt[, date := as.Date(date)]
dt[, dayOfWeek := weekdays(date)]
dta <- dt[, list(count = .N), by = list(dayOfWeek, hour)]  # flights per weekday/hour
ggplot(dta, aes(x = hour, y = dayOfWeek, fill = count)) +
geom_tile()  # heatmap of the aggregated counts
## long and wide tables
library(reshape2)
?dcast
?melt
dcast(dta, dayOfWeek ~ hour) # long -> wide: one column per hour
dta[dayOfWeek == 'Friday' & hour == 5]
dtw <- dcast(dta, hour ~ dayOfWeek)  # wide table with one column per weekday
melt(dtw, id.vars = 'hour') # wide -> long
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment