Created
November 14, 2017 17:49
-
-
Save daroczig/8fa7f998e8d2267da360462600d26a93 to your computer and use it in GitHub Desktop.
Példakódok a Budapest BI Fórum R workshopjáról: http://budapestbiforum.hu/2017/hu/workshopok/bevezetes-az-r-statisztikai-programozasi-nyelvbe-2/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## intro slides: http://bit.ly/r-intro-slide | |
## basic operations | |
1 + 3 | |
3*2 | |
3^2 | |
## constants | |
pi | |
"pi" | |
'pi' | |
letters | |
LETTERS | |
month.name | |
month.abb | |
## help | |
?pi | |
apropos('month') | |
apropos('cor') | |
## variables | |
x = 4 | |
x <- 4 | |
x | |
x / 2 | |
x^2 | |
x^0.5 | |
## functions | |
sqrt(x) | |
sqrt(x = x) | |
sin(pi / 2) | |
round(pi) | |
ceiling(pi) | |
floor(pi) | |
## custom functions | |
f <- function(x) 2 * x + 1 | |
f(5) | |
f(pi) | |
## vectors | |
c(1, 2, 3, 4, 5) | |
1:5 | |
seq(1, 5) | |
?seq | |
seq(1, 5, 0.1) | |
x <- seq(1, 5, 0.1) | |
plot(x, f(x)) | |
plot(x, f(x), type = 'l') | |
plot(x, f(x), type = 'l', xlab = '', main = '2x+1') | |
grid() | |
## TODO plot 1 period of sine curve | |
?sin | |
f <- function(x) sin(x) | |
x <- seq(0, pi * 2, 0.1) | |
plot(x, sin(x), type = 'l') | |
## a simpler way for plotting functions | |
curve(f) | |
?curve | |
curve(2*x + 1, from = 0, to = 50) | |
curve(x + 1, add = TRUE, col = 'red') | |
## TODO plot 1 period of sine curve | |
curve(sin, from = 0, to = 2*pi) | |
## TODO Brownian motion in 1D | |
runif(1) | |
runif(5) | |
## random numbers from uniform distribution | |
round(runif(5)) | |
round(runif(5))*2 - 1 | |
cumsum(round(runif(25))*2 - 1) | |
plot(cumsum(round(runif(25))*2 - 1), type = 's') | |
set.seed(42) | |
plot(cumsum(round(runif(25))*2 - 1), type = 's') | |
for (i in 2:6) { | |
lines(cumsum(round(runif(25))*2 - 1), type = 's', col = i) | |
} | |
## pro tipp #1 | |
plot(c(1, 25), c(-25, 25), type = 'n') | |
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25)) | |
## pro tipp #2 | |
sample(c(-1, 1), 25, replace = TRUE) | |
################################################################################ | |
## custom vectors -> combine values into vector | |
h <- c(174, 170, 160) | |
h | |
(w <- c(90, 80, 70)) | |
plot(h, w) | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight') | |
cor(w, h) | |
lm(w ~ h) # formula notation | |
fit <- lm(w ~ h) | |
summary(fit) | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight') | |
abline(fit, col = 'red') | |
## some basic modeling | |
predict(fit, list(h = c(56, 104))) | |
56 * fit$coefficients[2] + fit$coefficients[1] | |
56 * 1.3462 + -146.1538 | |
## why? | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight', | |
xlim = c(0, max(h)), | |
ylim = c(0, max(w))) | |
abline(fit, col = 'red') | |
## polynomial model | |
fit <- lm(w ~ poly(h, 2, raw = TRUE)) | |
fit <- lm(w ~ h + I(h^2)) | |
fit | |
2824 + 174 * -34.3571 + (174^2)*0.1071 | |
plot(h, w) | |
lines(h, predict(fit), col = 'red') | |
predict(fit, data.frame(h = c(56, 104))) | |
plot(0:200, predict(fit, list(h = 0:200)), type = 'l') | |
abline(v = c(160, 174), col = 'red') | |
## data.frame | |
df <- data.frame(weight = w, height = h) | |
df | |
str(df) | |
df$weight | |
?'$' | |
plot(df) | |
cor(df) | |
lm(weight ~ height, data = df) | |
str(df) | |
df[i, j] | |
df[1, ] | |
df[, 1] | |
df[3, 2] | |
df$weight | |
df$weight[2] | |
## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters. | |
df$height <- df$height/100 | |
df | |
df$bmi <- df$weight / df$height^2 | |
df | |
df$bmi <- df$weight / (df$height/100)^2 | |
## import data | |
df <- read.csv('http://bit.ly/heightweight') | |
str(df) | |
## TODO | |
df$height <- df$heightIn * 2.54 | |
df$weight <- df$weightLb * 0.45 | |
str(df) | |
df$heightIn <- df$weightLb <- NULL | |
str(df) | |
## descriptive stats | |
min(df$ageYear) | |
max(df$ageYear) | |
range(df$ageYear) | |
sum(df$weightLb) | |
length(df$weight) | |
nrow(df) | |
ncol(df) | |
dim(df) | |
## TODO plot and analyze association between height and weight | |
plot(df$height, df$weight) | |
abline(lm(weight ~ height, data = df), col = 'red') | |
## TODO compute and plot BMI index | |
df$bmi <- df$weight / (df$height/100)^2 | |
hist(df$bmi) | |
## exploratory data analysis | |
pairs(df) | |
library(GGally) | |
ggpairs(df) | |
library(pairsD3) | |
pairsD3(df) | |
## statistical test: t-test, ANOVA | |
t.test(height ~ sex, data = df) | |
aov(height ~ sex, data = df) | |
summary(aov(height ~ sex, data = df)) | |
summary(aov(weight ~ sex, data = df)) | |
## Post hoc tests => Tukey Honest Significant Differences | |
TukeyHSD(aov(height ~ sex, data = df)) | |
TukeyHSD(aov(weight ~ sex, data = df)) | |
## ############################################################################# | |
## Wilkinson: The Grammar of Graphics | |
## Wickham: ggplot2 -> layer by layer: | |
## - coordinate system, | |
## - aesthetics (labels) | |
## - scales | |
## - geometries | |
## - shapes | |
## - statistics | |
## - grid, color theme etc. + layers | |
## http://www.cookbook-r.com/Graphs/ | |
## ############################################################################# | |
library(ggplot2) | |
?diamonds | |
## barplot | |
ggplot(diamonds, aes(x = cut)) + geom_bar() | |
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw() | |
ggplot(diamonds, aes(x = cut)) + | |
geom_bar(colour = "darkgreen", fill = "white") + | |
theme_bw() | |
p <- ggplot(diamonds, aes(x = cut)) + | |
geom_bar(colour = "darkgreen", fill = "white") + theme_bw() | |
p | |
## coord transformations | |
library(scales) | |
p + scale_y_log10() | |
p + scale_y_sqrt() | |
p + scale_y_reverse() | |
p + coord_flip() | |
## facet | |
p + facet_wrap( ~ clarity) | |
p + facet_grid(color ~ clarity) | |
## stacked, clustered bar charts | |
p <- ggplot(diamonds, aes(x = color, fill = cut)) | |
p + geom_bar() | |
p + geom_bar(position = "fill") | |
p + geom_bar(position = "dodge") | |
## histogram | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 100) | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 1000) | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 10000) | |
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw() | |
## scatterplot | |
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point() | |
library(hexbin) | |
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_hex() | |
## adding layers | |
p | |
p <- p + geom_smooth() | |
p | |
p + geom_smooth(method = "lm", se = FALSE, color = 'red') | |
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth() | |
## themes | |
library(ggthemes) | |
p + theme_economist() + scale_colour_economist() | |
p + theme_stata() + scale_colour_stata() | |
p + theme_excel() + scale_colour_excel() | |
p + theme_wsj() + scale_colour_wsj("colors6", "") | |
p + theme_gdocs() + scale_colour_gdocs() | |
library(ggthemr) | |
## Further reading at http://www.cookbook-r.com/Graphs/ | |
## TODO on mtcars | |
## * number of carburators | |
ggplot(mtcars, aes(x = carb)) + geom_bar() | |
## * hp | |
ggplot(mtcars, aes(x = hp)) + geom_histogram() | |
## * barplot of carburators per am | |
ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am) | |
## stacked bar chart | |
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar() | |
## grouped/dodged | |
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge') | |
## * boxplot of hp by carb | |
ggplot(mtcars, aes(x = factor(am), y = hp)) + geom_boxplot() | |
## * hp and wt by carb | |
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) | |
## * hp and wt by carb regression | |
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth() | |
## ############################################################################# | |
## PCA demo on image processing | |
## ############################################################################# | |
## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg | |
## http://bit.ly/BudapestBI-R-img | |
download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg') | |
library(jpeg) | |
img <- readJPEG('image.jpg') | |
str(img) | |
dim(img) | |
h <- dim(img)[1] | |
w <- dim(img)[2] | |
h;w | |
img1d <- matrix(img, h * w) | |
str(img1d) | |
pca <- prcomp(img1d) | |
pca | |
summary(pca) | |
image(matrix(pca$x[, 1], h)) | |
image(matrix(pca$x[, 2], h)) | |
image(matrix(pca$x[, 2], h), col = gray.colors(100)) | |
image(matrix(pca$x[, 3], h)) | |
## ############################################################################# | |
## MDS | |
## ############################################################################# | |
## distance between 40 Hungarian cities -> 2D scatterplot | |
download.file('http://bit.ly/CEU-R-distances', 'cities.xls') | |
library(readxl) | |
cities <- read_excel('cities.xls') | |
str(cities) | |
## get rid of 1st column and last row (metadata) | |
cities <- cities[, -1] | |
cities <- cities[-nrow(cities), ] | |
mds <- cmdscale(as.dist(cities)) | |
mds | |
plot(mds) | |
text(mds[, 1], mds[, 2], names(cities)) | |
## flipping both x and y axis | |
mds <- -mds | |
plot(mds) | |
text(mds[, 1], mds[, 2], names(cities)) | |
## distance between European cities -> 2D scatterplot on ggplot | |
mds <- cmdscale(eurodist) | |
mds | |
mds <- as.data.frame(mds) | |
library(ggplot2) | |
ggplot(mds, aes(V1, -V2, label = rownames(mds))) + geom_text() | |
## non-geo stuff | |
str(mtcars) | |
?mtcars | |
mds <- cmdscale(dist(mtcars)) | |
plot(mds) | |
text(mds[, 1], mds[, 2], rownames(mtcars)) | |
mds <- as.data.frame(mds) | |
library(ggplot2) | |
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + | |
geom_text() + theme_bw() | |
library(ggrepel) | |
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + | |
geom_text_repel() + theme_bw() | |
## ############################################################################# | |
## feature selection - same in 2D | |
## ############################################################################# | |
str(iris) | |
plot(iris$Sepal.Length, iris$Sepal.Width) | |
fit <- lm(Sepal.Width ~ Sepal.Length, data = iris) | |
fit | |
summary(fit) | |
plot(iris$Sepal.Length, iris$Sepal.Width) | |
abline(fit, col = 'red', lwd = 5) | |
## TODO improve the model | |
summary(lm(Sepal.Width ~ Sepal.Length + Species, | |
data = iris)) | |
library(ggplot2) | |
ggplot(iris, aes(Sepal.Length, Sepal.Width, | |
col = Species)) + | |
geom_point() + geom_smooth(method = 'lm') | |
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + | |
geom_point(aes(col = Species)) + | |
geom_smooth(aes(col = Species), method = 'lm') + | |
geom_smooth(method = 'lm', col = 'black') + theme_bw() | |
## ############################################################################# | |
## PCA demo to show the steps in hierarchical clustering | |
## ############################################################################# | |
PC <- prcomp(iris[, 1:4])$x | |
dm <- dist(iris[, 1:4]) | |
hc <- hclust(dm) | |
plot(hc) | |
for (i in 2:8) { | |
plot(hc) | |
rect.hclust(hc, k = i, border = 'red') | |
Sys.sleep(1) | |
} | |
library(animation) | |
ani.options(interval = 1) | |
saveGIF({ | |
for (i in 2:8) { | |
plot(hc) | |
rect.hclust(hc, k = i, border = 'red') | |
} | |
}) | |
library(animation) | |
ani.options(interval = 1) | |
saveGIF({ | |
for (i in 1:8) { | |
plot(-PC[, 1:2], | |
col = cutree(hc, i), | |
pch = as.numeric(factor(iris$Species)) + 5) | |
} | |
}) | |
## ############################################################################# | |
## basic decision trees | |
## ############################################################################# | |
rm(iris) | |
set.seed(3) | |
iris$rnd <- runif(150) | |
iris <- iris[order(iris$rnd),] | |
iris | |
iris$rnd <- NULL | |
train <- iris[1:100, ] | |
test <- iris[101:150, ] | |
library(rpart) | |
ct <- rpart(Species ~ ., data = train) | |
summary(ct) | |
plot(ct); text(ct) | |
library(partykit) | |
plot(as.party(ct)) | |
predict(ct, newdata = test) | |
predict(ct, newdata = test, type = 'class') | |
## confusion matrix | |
table(test$Species, | |
predict(ct, newdata = test, | |
type = 'class')) | |
ct <- rpart(Species ~ ., data = train, | |
minsplit = 3) | |
## ?rpart.control | |
plot(ct); text(ct) | |
table(test$Species, | |
predict(ct, newdata = test, | |
type = 'class')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment