Skip to content

Instantly share code, notes, and snippets.

@Nimster
Created February 11, 2013 20:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Nimster/4757490 to your computer and use it in GitHub Desktop.
Save Nimster/4757490 to your computer and use it in GitHub Desktop.
An introduction to R, as presented at http://www.meetup.com/Big-Data-Israel/events/96536782/
######### Intro to R ###############
######### The Data Frame ###########
# One row per party (party names are the row names) with the leader's name,
# the political bloc and the number of mandates.
# Category is built with factor() explicitly: since R 4.0.0 data.frame() no
# longer turns strings into factors, and the "Factors" section at the end of
# this part needs a real factor to show.
df <- data.frame(
  row.names = c("HaLikud", "Yesh Atid", "HaAvoda", "HaBait HaYehudi",
                "Yehadut HaTora", "Meretz", "Shas"),
  LeaderName = c("Netanyahu", "Lapid", "Yehimovitch", "Bennet", "Litzman",
                 "GalOn", "Yishai"),
  Category = factor(c("Right", "Center", "Left", "Right", "Religious", "Left",
                      "Religious")),
  Mandates = c(31, 19, 15, 12, 7, 6, 11)
)
df
colnames(df)
rownames(df)
# df[rows, columns]: a logical vector in the row slot keeps the TRUE rows.
df[df$Category == "Right", ]
df[df$Category == "Right", "Mandates"]
df[df$Category == "Right", c("Mandates", "LeaderName")]
df[df$Category == "Right", c(3, 1)] # columns by position
df[df$Category == "Right", -2] # every column except the second
# Conditions combine element-wise with & and |.
df[(df$Category == "Right" & df$Mandates > 20) | (df$Category == "Left" & df$Mandates < 15), ]
# The mask itself, then the positions of its TRUE entries via which().
(df$Category == "Right" & df$Mandates > 20) | (df$Category == "Left" & df$Mandates < 15)
which((df$Category == "Right" & df$Mandates > 20) | (df$Category == "Left" & df$Mandates < 15))
df[which((df$Category == "Right" & df$Mandates > 20) | (df$Category == "Left" & df$Mandates < 15)), ]
# The same indexing works on the left-hand side of an assignment.
df[which((df$Category == "Right" & df$Mandates > 20) | (df$Category == "Left" & df$Mandates < 15)), "Mandates"] <- 999
df[df$Mandates > 500, "Mandates"] <- NA
df[!is.na(df$Mandates), ]
df[df$Category %in% c("Right", "Left"), ]
# with() evaluates its expression with the columns of df visible as variables.
with(df, Mandates * 3)
# BOO; So how did this work? Outside with() there is no variable Mandates, so
# this is an error. try() reports it without aborting a source()d script.
try(Mandates)
# Rubyists: understand this as a ruby block (closure)
with(df, {
  print("HERE!")
  Mandates * 3
})
# But this is unique
# transform() returns a copy of df with the new columns added.
transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))
## Factors
df$Category
## Matrices
# matrix() fills column by column: 1:4 is the first column, 5:8 the second.
matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 4, ncol = 2)
# %*% is the matrix product: (4 x 2) %*% (2 x 3) gives a 4 x 3 matrix.
matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 4, ncol = 2) %*%
  matrix(c(1, 2, 3, -1, -2, -3), nrow = 2, ncol = 3)
## Vectors
1:8
c(1:4, 10:15)
# each = 3 repeats every element three times; times = 2 then repeats the
# whole result twice.
rep(seq(1, 2, by = 0.2), each = 3, times = 2)
## Time Series
# 41 observations starting in 1960 with 12 observations per time unit.
# The argument is spelled out as `frequency`; `freq` only worked through
# partial argument matching.
ts(seq(100, 300, by = 5), start = 1960, frequency = 12)
## Everything is vectorized
# Arithmetic and math functions act element-wise on whole vectors.
sqrt((1:10)^2)
# ifelse() chooses per element: TRUE where Mandates exceeds 10, FALSE otherwise.
df[["Large"]] <- with(df, ifelse(Mandates > 10, TRUE, FALSE))
df
# paste() is vectorized as well: relabel every row as
# "<row name> BeRashut <leader name>".
row.names(df) <- paste(row.names(df), df[["LeaderName"]], sep = " BeRashut ")
df
## R is functional
# f(x, y): the sum of the two arguments divided by their difference.
# Both operations are element-wise, so vectors are accepted as well.
f <- function(x, y) {
  total <- x + y
  gap <- x - y
  total / gap
}
f(3, 4)
f(2:3, 4:5)
# outer() evaluates f for every combination of an element of 1:3 with an
# element of 4:6 and returns the results as a matrix.
outer(1:3, 4:6, FUN = f)
# Open the help page of the exact binomial test.
help("binom.test")
binom.test(50, 100)
# The result is a list; ls() shows the names of its components.
ls(binom.test(50, 100))
# Drill down to the confidence interval, one step at a time.
ci <- binom.test(50, 100)$conf.int
ci
ci[1]
ci * 100
# as.vector() strips the attributes, leaving just the two numbers.
as.vector(ci * 100)
# s(x, n): the two confidence-interval bounds reported by binom.test(x, n),
# multiplied by n and returned as a plain numeric vector without attributes.
s <- function(x, n) {
  as.vector(binom.test(x, n)$conf.int * n)
}
s(50, 100)
# BOO: binom.test() does not accept a vector of six counts, so this call is
# an error. try() reports it without aborting a source()d script.
try(s(50:55, 100))
# Vectorize() returns a wrapper that calls s once per element of x.
vs <- Vectorize(s, "x")
vs(50:55, 100)
# Named inputs: the names of the first vectorized argument label the columns
# of the result.
x <- seq(10, 40, by = 10)
names(x) <- x
n <- seq(100, 400, by = 100)
names(n) <- n
# Vectorize over both arguments; they are walked through in parallel.
vs2 <- Vectorize(s, c("x", "n"))
vs2(x, 100)
vs2(50, n)
vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])
# Like this? explore mapply, ddply (plyr package), etc.
## R integrates well
# Install gdata only when it is missing, instead of re-installing it on
# every run of the script.
if (!requireNamespace("gdata", quietly = TRUE)) {
  install.packages("gdata")
}
library(gdata)
# ":" marks a missing value in this sheet; text columns stay character.
xl <- read.xls("~/Downloads/eurostat_dirty.xlsx", sheet = 1, na.strings = ":",
               stringsAsFactors = FALSE)
xl
# Keep the first 39 rows. head() also copes with a sheet of fewer than 40
# rows, where -(40:nrow(xl)) would count backwards and drop real rows.
xl <- head(xl, 39)
# The first column supplies the row labels.
rownames(xl) <- xl[, 1]
# Keep every second column starting with the second, then drop the first of
# those.
xl <- xl[, seq(2, ncol(xl), by = 2)]
xl <- xl[, -1]
# Row 2 supplies the column labels, prefixed with "Y" (later code refers to
# the columns Y2001 and Y2011).
colnames(xl) <- paste0("Y", xl[2, ])
# Drop the first four rows.
xl <- xl[-(1:4), ]
# A single NA makes the whole column sum NA unless na.rm = TRUE.
colSums(xl)
colSums(xl, na.rm = TRUE)
# Keep only the rows without any NA.
xl <- na.omit(xl)
apply(xl, MARGIN = 2, max)
apply(xl, MARGIN = 2, mean)
summary(xl)
## Advanced data processing
# Bin the Y2011 values at the break points 0, 30, 100 and Inf and label the
# three resulting bins.
xl$Bed_Category <- cut(
  xl$Y2011,
  c(0, 30, 100, Inf),
  labels = c("Little", "Medium", "Lots")
)
help("tapply")
# Mean of Y2011 within each bin.
with(xl, tapply(Y2011, Bed_Category, FUN = mean))
# rle() summarises a vector as runs of equal values and their lengths.
x <- rep(1:3, each = 3, times = 2)
x
rle(x)
# Where to advertise? A multi-armed bandit approach.
# Draw three row names with replacement, weighting each row by its Y2011
# value.
sample(rownames(xl), 3, replace = TRUE, prob = xl$Y2011)
cut(xl$Y2001, 3) # Generate cut points automatically. Oh-oh
quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
# Bin Y2001 at its own 0 / 25 / 75 / 100 % quantiles. The breaks used to be
# taken from Y2011, so any Y2001 value outside the Y2011 range silently
# became NA (and showed up as an "NA" row in the riddle below).
xl$Old_Bed_Category <- cut(xl$Y2001,
  quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1)),
  labels = c("Low", "Medium", "High"),
  right = TRUE, include.lowest = TRUE) # Include both ends of the range
xl
# in R, the question is often "What's the function that does *THAT*?"
# ftable() cross-tabulates the two category columns.
ftable(xl[, c("Bed_Category", "Old_Bed_Category")])
## Riddle: How do I find the problematic 5?
## ...
## ...
## ...
rownames(xl[xl$Bed_Category == "Little" & xl$Old_Bed_Category == "Low", ])
## More data plays
# order() returns the row positions that would sort Y2011 ascending ...
ascending <- order(xl$Y2011)
ascending # Huh?
# ... so indexing the rows with it yields the sorted table.
xl[ascending, ] # ahhhh
# rank() gives each value's position in the sorted order instead.
rank(xl$Y2011) # Inversed perm!
## Stats & Probability
# Five draws from a uniform distribution between 0 and 3.
runif(5, 0, 3)
# One draw from a binomial distribution: 100 trials, success probability 0.5.
rbinom(1, 100, 0.5)
# Histograms of 10, 100 and 1000 such draws.
hist(rbinom(10, 100, 0.5))
hist(rbinom(100, 100, 0.5))
hist(rbinom(1000, 100, 0.5)) # CLT! The more draws, the more bell-shaped the histogram.
# 1000 draws from a normal distribution with mean 1.5 and standard deviation 1.
hist(rnorm(1000, 1.5, 1))
library(ggplot2)
l <- rnorm(1000, 1.5, 1)
# The same kind of histogram with ggplot2: qplot() returns a plot object,
# and printing it draws the plot.
p <- qplot(l, geom = 'histogram')
p
# Adding xlab() to the plot object produces a plot with a custom x-axis label.
p + xlab("Coffee breaks per day")
## More cool IO
library(XML)
# NOTE(review): this is a plain-http URL; if the site now redirects to https,
# readHTMLTable() may not be able to fetch it directly - confirm.
theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
# One list element per HTML table found on the page.
tables <- readHTMLTable(theurl)
# Row count of every table. vapply() returns exactly one count per list
# element (NROW(NULL) is 0), so the which.max() position below lines up with
# `tables`. unlist(lapply(...)) would drop NULL entries and shift positions.
n.rows <- vapply(tables, NROW, integer(1))
# Pick the table with the most rows.
tbl <- tables[[which.max(n.rows)]]
tbl
tbl <- tbl[, 1:6]
colnames(tbl)[c(2, 4, 5)] <- c("Height", "Type", "Use")
# Height: drop everything from the first whitespace on, then parse as number.
tbl$Height <- as.numeric(gsub("\\s.*", "", tbl$Height))
# Year: entries without any digit become NA; otherwise keep the leading
# digits and drop everything from the first non-digit on.
tbl$Year[!grepl("\\d", tbl$Year)] <- NA
tbl$Year <- as.numeric(sub("\\D.*", "", tbl$Year))
qplot(data = tbl, x = Year, y = Height, color = Country)
qplot(data = na.omit(tbl), x = Type, geom = "bar")
qplot(data = na.omit(tbl), x = Type, geom = "bar", fill = Country)
## Some linear models
mtcars
ggplot(data = mtcars, aes(factor(cyl), fill = factor(gear))) +
  geom_bar(position = "dodge")
# Straight-line fit of mpg on hp.
lm(mpg ~ hp, data = mtcars)
l <- lm(mpg ~ hp, data = mtcars)
summary(l) # Look at summary(l)$r.squared
hp_scatter <- qplot(data = mtcars, x = hp, y = mpg, geom = "point")
hp_scatter
hp_scatter + geom_smooth(method = "lm")
# Without newdata predict() returns the fitted values for the rows used in
# the fit; with newdata it predicts for the supplied hp values.
predict(l)
predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))
# Add a squared term; I() makes ^ mean arithmetic power inside a formula.
l <- lm(mpg ~ hp + I(hp^2), data = mtcars)
summary(l)
# Draw the fitted curve over the scatter plot by predicting on a grid of hp.
hp_seq <- seq(50, 350, by = 3)
qplot(data = mtcars, hp, mpg, geom = "point") +
  geom_line(
    data = data.frame(hp = hp_seq, mpg = predict(l, data.frame(hp = hp_seq))),
    aes(x = hp, y = mpg)
  )
# The same with every power of hp up to 10.
l <- lm(
  mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) + I(hp^6) + I(hp^7) +
    I(hp^8) + I(hp^9) + I(hp^10),
  data = mtcars
)
hp_seq <- seq(50, 335, by = 3)
qplot(data = mtcars, hp, mpg, geom = "point") +
  geom_line(
    data = data.frame(hp = hp_seq, mpg = predict(l, data.frame(hp = hp_seq))),
    aes(x = hp, y = mpg)
  )
# Several predictors at once.
l <- lm(mpg ~ hp + wt + cyl, data = mtcars)
summary(l)
qplot(data = mtcars, x = wt, y = mpg, geom = "point") +
  geom_smooth(method = "lm")
## ... Incidentally...
# Join tbl's Country column against the row names of xl, keeping only rows
# present in both (all = FALSE). TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
merge(tbl, xl, by.x = "Country", by.y = "row.names", all = FALSE)
# Pivots
library(reshape)
# Sum of Mandates per Category; na.rm = TRUE is passed on to sum().
cast(df, Category ~ ., value = "Mandates", fun.aggregate = sum, na.rm = TRUE)
# Number of rows per Category / Large combination.
cast(df, Category + Large ~ ., value = "Mandates", fun.aggregate = length)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment