# Nimster/intro_to_R

## intro_to_R
######### Intro to R ###############

######### The Data Frame ###########
# Demo data set: one row per party (the party name is the row name) with the
# leader's name, a category label and the number of mandates.
df <- data.frame(
  LeaderName = c(
    "Netanyahu", "Lapid", "Yehimovitch", "Bennet", "Litzman", "GalOn", "Yishai"
  ),
  Category = c(
    "Right", "Center", "Left", "Right", "Religious", "Left", "Religious"
  ),
  Mandates = c(31, 19, 15, 12, 7, 6, 11),
  row.names = c(
    "HaLikud", "Yesh Atid", "HaAvoda", "HaBait HaYehudi", "Yehadut HaTora",
    "Meretz", "Shas"
  )
)

# Typing an object's name at the prompt prints it.
df

# Column labels and row labels of the data frame.
colnames(df)
rownames(df)

# Indexing is df[rows, columns]. A logical vector in the row slot keeps the
# rows where it is TRUE; an empty column slot keeps every column.
df[df$Category == 'Right', ]
# Columns can be picked by name ...
df[df$Category == 'Right', "Mandates"]
df[df$Category == 'Right', c("Mandates", "LeaderName")]
# ... by position (here column 3, then column 1) ...
df[df$Category == 'Right', c(3, 1)]
# ... or excluded with a negative position (everything except column 2).
df[df$Category == 'Right', -2]

# Conditions combine element-wise with & (and) and | (or).
df[(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15), ]

# The condition on its own is just a logical vector, one element per row.
(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)

# which() turns the logical vector into the positions of its TRUE elements.
which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) )

# Those positions select the same rows as the logical vector did.
df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , ]

# An indexing expression can also be assigned to: overwrite Mandates in the
# matching rows.
df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , "Mandates"] <- 999

# Replace the 999 placeholders written above with NA (missing).
df[ df$Mandates > 500 , "Mandates"] <- NA

# Keep only the rows whose Mandates value is not missing.
df[! is.na(df$Mandates), ]

# %in% tests membership in a set of values.
df[df$Category %in% c('Right', 'Left'), ]

# with() evaluates the expression using the columns of df as variables.
with(df, Mandates * 3)
# BOO: outside with() there is no variable called Mandates, so this line is
# expected to fail (unless a global of that name happens to exist).
Mandates # BOO; So how did this work?
# Rubyists: understand this as a ruby block (closure)
with(df, { print("HERE!")
           Mandates * 3 })
# transform() evaluates its arguments the same way and returns a copy of df
# with the new columns added; df itself is not modified here.
transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))

## Factors
# Since R 4.0.0 data.frame() no longer converts strings to factors by default,
# so the column is converted explicitly to show a factor and its levels.
factor(df$Category)

## Matrices
# The values fill the matrix column by column.
matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2)

# %*% is matrix multiplication: a 4 x 2 matrix times a 2 x 3 matrix.
matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2) %*% matrix(c(1,2,3,-1,-2,-3), nrow = 2, ncol = 3)

## Vectors
# a:b builds a sequence of consecutive integers.
1:8

# c() concatenates vectors.
c(1:4, 10:15)

# `each` repeats every element in place, `times` repeats the whole result.
rep(seq(1, 2, by = 0.2), each = 3, times = 2)

## Time Series
# Series with 12 observations per time unit, starting in 1960. The argument is
# spelled out as `frequency`; `freq` only worked via partial argument matching.
ts(seq(100, 300, by = 5), start = 1960, frequency = 12)

## Everything is vectorized
# Arithmetic and functions apply to whole vectors at once.
sqrt((1:10)^2)
# ifelse() is the vectorized conditional: one result per row of df.
df[["Large"]] <- ifelse(df[["Mandates"]] > 10, TRUE, FALSE)
df
# paste() is vectorized too: one new row name per row.
rownames(df) <- paste(
  rownames(df),
  df[["LeaderName"]],
  sep = " BeRashut "
)
df

## R is functional
# Ratio of the sum to the difference of x and y, computed element-wise.
# Gives Inf/-Inf/NaN where x equals y (division by zero).
f <- function(x, y) {
  total <- x + y
  gap <- x - y
  total / gap
}

# Called with scalars, then with vectors: the body is plain arithmetic, so it
# works element-wise without any change.
f(3, 4)
f(2:3, 4:5)

# outer() calls f on every combination of the two vectors and returns a matrix.
outer(1:3, 4:6, FUN=f)

# Open the help page, then run an exact binomial test (50 successes in 100).
?binom.test
binom.test(50, 100)
# The result is a list-like object; ls() shows the names of its components.
ls(binom.test(50, 100))
# Pull out the confidence interval, its first element, and a rescaled copy.
binom.test(50, 100)$conf.int
binom.test(50, 100)$conf.int[1]
binom.test(50, 100)$conf.int * 100
# as.vector() strips the attributes so only the two numbers remain.
as.vector(binom.test(50, 100)$conf.int * 100)

# Confidence interval of binom.test(x, n), rescaled from proportions to counts
# out of n and returned as a plain numeric vector (lower bound, upper bound).
s <- function(x, n) {
  ci <- binom.test(x, n)$conf.int
  as.vector(ci * n)
}
s(50, 100)
# BOO: binom.test() does not accept a vector of success counts here, so this
# call is expected to fail.
s(50:55, 100) # BOO
# Vectorize() wraps s so that it is called once per element of `x`.
vs <- Vectorize(s, "x")
vs(50:55, 100)
# Name each element after its own value (presumably so results are labelled).
x <- seq(10, 40, by = 10)
names(x) <- x
n <- seq(100, 400, by = 100)
names(n) <- n
# Vectorize over both arguments.
vs2 <- Vectorize(s, c("x", "n"))
vs2(x, 100)
vs2(50, n)
# With two vectors the elements are paired up, not crossed:
vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])

# Like this? explore mapply, ddply (plyr package), etc.

## R integrates well
# Install gdata only when it is missing instead of re-installing on every run.
if (!requireNamespace("gdata", quietly = TRUE)) {
  install.packages("gdata")
}
library(gdata)

# Read the first sheet; cells containing ":" are read as NA.
xl <- read.xls('~/Downloads/eurostat_dirty.xlsx', sheet = 1, na.strings = ':', stringsAsFactors = FALSE)
xl
# Keep the first 39 rows. head() also copes with a sheet shorter than 40 rows,
# where the descending sequence 40:nrow(xl) would have dropped the wrong rows.
xl <- head(xl, 39)
# Use the first column as row names.
rownames(xl) <- xl[, 1]
# Keep every second column starting at column 2, then drop the first of those.
xl <- xl[, seq(2, ncol(xl), by = 2)]
xl <- xl[, -1]
# Build column names "Y<value>" from the values in row 2, then drop the first
# four rows.
colnames(xl) <- paste0("Y", xl[2, ])
xl <- xl[-(1:4), ]

# NOTE(review): the steps below assume the remaining columns are numeric --
# TODO confirm against the actual spreadsheet.
# A column containing NA sums to NA unless na.rm = TRUE is given.
colSums(xl)
colSums(xl, na.rm = TRUE)
# Drop every row that contains an NA.
xl <- na.omit(xl)
# MARGIN = 2 applies the function to each column.
apply(xl, MARGIN = 2, max)
apply(xl, MARGIN = 2, mean)
summary(xl)

## Advanced data processing
# Bin Y2011 into three labelled ranges: (0, 30], (30, 100] and (100, Inf].
xl$Bed_Category <- cut(xl$Y2011, c(0, 30, 100, Inf), labels = c("Little", "Medium", "Lots"))
?tapply
# Mean of Y2011 within each Bed_Category.
tapply(xl$Y2011, xl$Bed_Category, FUN = mean)

x <- c(rep(1:3, each = 3, times = 2))
x
# Run-length encoding: lengths and values of runs of equal neighbours.
rle(x)

# Where to advertise? A multi-armed bandit approach.
# Draw three row names with replacement, weighted by Y2011. The weights are
# passed as `prob =` by name rather than by position.
sample(rownames(xl), 3, replace = TRUE, prob = xl$Y2011)

cut(xl$Y2001, 3)  # Generate cut points automatically. Oh-oh
quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
# NOTE(review): the breaks are quantiles of Y2011 although the values being cut
# are Y2001 (and the line above inspects the Y2001 quantiles) -- confirm this
# is intended; Y2001 values outside the Y2011 range become NA.
xl$Old_Bed_Category <- cut(xl$Y2001,
    quantile(xl$Y2011, probs = c(0, 0.25, 0.75, 1)),
    labels = c('Low', 'Medium', 'High'),
    right = TRUE, include.lowest = TRUE) # Include both ends of the range

xl

# In R, the question is often "What's the function that does *THAT*?"
# ftable() cross-tabulates the two category columns.
ftable(xl[, c('Bed_Category', 'Old_Bed_Category')])

## Riddle: How do I find the problematic 5?
## ...
## ...
## ...
# Row names of the rows that are 'Little' in one category and 'Low' in the other.
rownames(xl[xl$Bed_Category == 'Little' & xl$Old_Bed_Category == 'Low', ])

## More data plays
# order() returns the row positions that would sort the column ...
order(xl$Y2011) # Huh?
# ... so indexing with it gives the rows sorted by Y2011.
xl[order(xl$Y2011), ] # ahhhh
# rank() gives each value's position in the sorted order; without ties it is
# the inverse permutation of order().
rank(xl$Y2011) # Inversed perm!

## Stats & Probability
# r<dist>() functions draw random numbers: 5 uniform draws between 0 and 3,
# then one binomial draw (100 trials, success probability 0.5).
runif(5, 0, 3)
rbinom(1, 100, 0.5)
# Histograms of 10, 100 and 1000 binomial draws: the shape gets closer to a
# bell curve as the number of draws grows.
hist(rbinom(10, 100, 0.5))
hist(rbinom(100, 100, 0.5))
hist(rbinom(1000, 100, 0.5)) # CLT!
# 1000 normal draws with mean 1.5 and standard deviation 1.
hist(rnorm(1000, 1.5, 1))

library(ggplot2)
l <- rnorm(1000, 1.5, 1)
# NOTE(review): qplot() is deprecated in recent ggplot2 releases; it still
# works but may warn -- confirm against the installed version.
p <- qplot(l, geom = 'histogram')
p
# A plot object can be extended with `+`; here the x axis label is replaced.
p + xlab("Coffee breaks per day")

## More cool IO
library(XML)
theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
# Parse every HTML table on the page into a list.
tables <- readHTMLTable(theurl)
# Row count of every table. NROW() gives 0 for a NULL entry, so the result has
# exactly one element per table and which.max() indexes `tables` correctly;
# unlist(lapply(...)) silently dropped NULL entries and could shift the index.
n.rows <- vapply(tables, NROW, integer(1))
# Work with the table that has the most rows.
tbl <- tables[[which.max(n.rows)]]
tbl
# Keep the first six columns and rename columns 2, 4 and 5.
# NOTE(review): relies on the page layout also providing `Year` and `Country`
# columns -- confirm against the live table.
tbl <- tbl[, 1:6]
colnames(tbl)[c(2, 4, 5)] <- c('Height', 'Type', 'Use')
# Keep the text before the first whitespace and convert it to a number.
tbl$Height <- as.numeric(gsub('\\s.*', '', tbl$Height))
# Entries without any digit carry no year.
tbl$Year[!grepl("\\d", tbl$Year)] <- NA
# Drop everything from the first non-digit character on, then convert.
tbl$Year <- as.numeric(sub("\\D.*", '', tbl$Year))

# Height against year, coloured by country.
qplot(data = tbl, x = Year, y = Height, color = Country)

# Bar charts of the number of rows per Type, using complete rows only.
qplot(data = na.omit(tbl), x = Type, geom = "bar")
qplot(data = na.omit(tbl), x = Type, geom = "bar", fill = Country)

## Some linear models
# mtcars is a data set that ships with R.
mtcars

# Number of cars per cylinder count, with side-by-side bars per gear count.
ggplot(mtcars, aes(x = factor(cyl), fill = factor(gear))) +
  geom_bar(position = "dodge")

# Simple linear regression of mpg on hp: print the fit, then keep it in `l`.
lm(mpg ~ hp, data = mtcars)
l <- lm(mpg ~ hp, data = mtcars)
summary(l) # Look at summary(l)$r.squared
qplot(x = hp, y = mpg, data = mtcars, geom = "point")
qplot(x = hp, y = mpg, data = mtcars, geom = "point") +
  geom_smooth(method = "lm")

# Fitted values for the original rows, then predictions for new hp values.
predict(l)
predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))

# Quadratic fit; I() makes ^ mean arithmetic power inside a formula.
l <- lm(mpg ~ hp + I(hp^2), data = mtcars)
summary(l)
# Scatter plot with the fitted curve evaluated on a grid of hp values.
qplot(hp, mpg, data = mtcars, geom = "point") +
  geom_line(
    data = data.frame(
      hp = seq(50, 350, by = 3),
      mpg = predict(l, data.frame(hp = seq(50, 350, by = 3)))
    ),
    mapping = aes(x = hp, y = mpg)
  )

# Degree-10 polynomial fit, drawn the same way.
l <- lm(
  mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) + I(hp^6) + I(hp^7) +
    I(hp^8) + I(hp^9) + I(hp^10),
  data = mtcars
)
qplot(hp, mpg, data = mtcars, geom = "point") +
  geom_line(
    data = data.frame(
      hp = seq(50, 335, by = 3),
      mpg = predict(l, data.frame(hp = seq(50, 335, by = 3)))
    ),
    mapping = aes(x = hp, y = mpg)
  )

# Multiple regression with three predictors.
l <- lm(mpg ~ hp + wt + cyl, data = mtcars)
summary(l)
qplot(x = wt, y = mpg, data = mtcars, geom = "point") +
  geom_smooth(method = "lm")

## ... Incidentally...
# Join tbl$Country against the row names of xl; all = FALSE keeps only the
# rows that have a match in both tables.
merge(tbl, xl, by.x = 'Country', by.y = 'row.names', all = FALSE)
# Pivots
library(reshape)
# Aggregate Mandates per Category with sum(); na.rm = TRUE is passed on to sum().
cast(df, Category ~ ., value = 'Mandates', fun.aggregate = sum, na.rm = TRUE)
# Count the entries per Category / Large combination.
cast(df, Category + Large ~ ., value = 'Mandates', fun.aggregate = length)
	######### Intro to R ###############

	######### The Data Frame ###########
	# Demo data set: one row per party (the party name is the row name) with the
	# leader's name, a category label and the number of mandates.
	df <- data.frame(
	  LeaderName = c(
	    "Netanyahu", "Lapid", "Yehimovitch", "Bennet", "Litzman", "GalOn", "Yishai"
	  ),
	  Category = c(
	    "Right", "Center", "Left", "Right", "Religious", "Left", "Religious"
	  ),
	  Mandates = c(31, 19, 15, 12, 7, 6, 11),
	  row.names = c(
	    "HaLikud", "Yesh Atid", "HaAvoda", "HaBait HaYehudi", "Yehadut HaTora",
	    "Meretz", "Shas"
	  )
	)

	# Typing an object's name at the prompt prints it.
	df

	# Column labels and row labels of the data frame.
	colnames(df)
	rownames(df)

	# Indexing is df[rows, columns]. A logical vector in the row slot keeps the
	# rows where it is TRUE; an empty column slot keeps every column.
	df[df$Category == 'Right', ]
	# Columns can be picked by name, by position, or excluded with a negative
	# position.
	df[df$Category == 'Right', "Mandates"]
	df[df$Category == 'Right', c("Mandates", "LeaderName")]
	df[df$Category == 'Right', c(3, 1)]
	df[df$Category == 'Right', -2]

	# Conditions combine element-wise with & (and) and | (or). The OR operator
	# had been written as `\|` (a Markdown table escape), which is not valid R.
	df[(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15), ]

	# The condition on its own is just a logical vector, one element per row.
	(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)

	# which() turns the logical vector into the positions of its TRUE elements.
	which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) )

	# Those positions select the same rows as the logical vector did.
	df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , ]

	# An indexing expression can also be assigned to: overwrite Mandates in the
	# matching rows.
	df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , "Mandates"] <- 999

	# Replace the 999 placeholders written above with NA (missing).
	df[ df$Mandates > 500 , "Mandates"] <- NA

	# Keep only the rows whose Mandates value is not missing.
	df[! is.na(df$Mandates), ]

	# %in% tests membership in a set of values.
	df[df$Category %in% c('Right', 'Left'), ]

	# with() evaluates the expression using the columns of df as variables.
	with(df, Mandates * 3)
	# BOO: outside with() there is no variable called Mandates, so this line is
	# expected to fail (unless a global of that name happens to exist).
	Mandates # BOO; So how did this work?
	# Rubyists: understand this as a ruby block (closure)
	with(df, { print("HERE!")
	           Mandates * 3 })
	# transform() evaluates its arguments the same way and returns a copy of df
	# with the new columns added; df itself is not modified here.
	transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))

	## Factors
	# Since R 4.0.0 data.frame() no longer converts strings to factors by
	# default, so the column is converted explicitly to show a factor.
	factor(df$Category)

	## Matrices
	# The values fill the matrix column by column.
	matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2)

	# %*% is matrix multiplication: a 4 x 2 matrix times a 2 x 3 matrix.
	matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2) %*% matrix(c(1,2,3,-1,-2,-3), nrow = 2, ncol = 3)

	## Vectors
	# a:b builds a sequence of consecutive integers; c() concatenates vectors.
	1:8

	c(1:4, 10:15)

	# `each` repeats every element in place, `times` repeats the whole result.
	rep(seq(1, 2, by = 0.2), each = 3, times = 2)

	## Time Series
	# Series with 12 observations per time unit, starting in 1960. The argument
	# is spelled out as `frequency`; `freq` only worked via partial matching.
	ts(seq(100, 300, by = 5), start = 1960, frequency = 12)

	## Everything is vectorized
	# Arithmetic, ifelse() and paste() all work on whole vectors at once.
	sqrt((1:10) ** 2)
	df$Large <- ifelse(df$Mandates > 10, TRUE, FALSE)
	df
	rownames(df) <- paste(rownames(df), df$LeaderName, sep = ' BeRashut ')
	df

	## R is functional
	# Ratio of the sum to the difference of x and y, computed element-wise.
	# Gives Inf/-Inf/NaN where x equals y (division by zero).
	f <- function(x, y) {
	  total <- x + y
	  gap <- x - y
	  total / gap
	}

	# Called with scalars, then with vectors: the body is plain arithmetic, so
	# it works element-wise without any change.
	f(3, 4)
	f(2:3, 4:5)

	# outer() calls f on every combination of the two vectors.
	outer(1:3, 4:6, FUN=f)

	# Open the help page, then run an exact binomial test (50 successes in 100).
	?binom.test
	binom.test(50, 100)
	# The result is a list-like object; ls() shows the names of its components.
	ls(binom.test(50, 100))
	# Pull out the confidence interval, its first element, and a rescaled copy.
	binom.test(50, 100)$conf.int
	binom.test(50, 100)$conf.int[1]
	binom.test(50, 100)$conf.int * 100
	# as.vector() strips the attributes so only the two numbers remain.
	as.vector(binom.test(50, 100)$conf.int * 100)

	# Confidence interval of binom.test(x, n), rescaled from proportions to
	# counts out of n and returned as a plain numeric vector (lower, upper).
	s <- function(x, n) {
	  ci <- binom.test(x, n)$conf.int
	  as.vector(ci * n)
	}
	s(50, 100)
	# BOO: binom.test() does not accept a vector of success counts here, so
	# this call is expected to fail.
	s(50:55, 100) # BOO
	# Vectorize() wraps s so that it is called once per element of `x`.
	vs <- Vectorize(s, "x")
	vs(50:55, 100)
	# Name each element after its own value (presumably so results are labelled).
	x <- seq(10, 40, by = 10)
	names(x) <- x
	n <- seq(100, 400, by = 100)
	names(n) <- n
	# Vectorize over both arguments.
	vs2 <- Vectorize(s, c("x", "n"))
	vs2(x, 100)
	vs2(50, n)
	# With two vectors the elements are paired up, not crossed:
	vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])

	# Like this? explore mapply, ddply (plyr package), etc.

	## R integrates well
	# Install gdata only when it is missing instead of re-installing every run.
	if (!requireNamespace("gdata", quietly = TRUE)) {
	  install.packages("gdata")
	}
	library(gdata)

	# Read the first sheet; cells containing ":" are read as NA.
	xl <- read.xls('~/Downloads/eurostat_dirty.xlsx', sheet = 1, na.strings = ':', stringsAsFactors = FALSE)
	xl
	# Keep the first 39 rows. head() also copes with a sheet shorter than 40
	# rows, where the descending sequence 40:nrow(xl) dropped the wrong rows.
	xl <- head(xl, 39)
	# Use the first column as row names.
	rownames(xl) <- xl[, 1]
	# Keep every second column starting at column 2, then drop the first of those.
	xl <- xl[, seq(2, ncol(xl), by = 2)]
	xl <- xl[, -1]
	# Build column names "Y<value>" from the values in row 2, then drop the
	# first four rows.
	colnames(xl) <- paste0("Y", xl[2, ])
	xl <- xl[-(1:4), ]

	# NOTE(review): the steps below assume the remaining columns are numeric --
	# TODO confirm against the actual spreadsheet.
	# A column containing NA sums to NA unless na.rm = TRUE is given.
	colSums(xl)
	colSums(xl, na.rm = TRUE)
	# Drop every row that contains an NA.
	xl <- na.omit(xl)
	# MARGIN = 2 applies the function to each column.
	apply(xl, MARGIN = 2, max)
	apply(xl, MARGIN = 2, mean)
	summary(xl)

	## Advanced data processing
	# Bin Y2011 into three labelled ranges: (0, 30], (30, 100] and (100, Inf].
	xl$Bed_Category <- cut(xl$Y2011, c(0, 30, 100, Inf), labels = c("Little", "Medium", "Lots"))
	?tapply
	# Mean of Y2011 within each Bed_Category.
	tapply(xl$Y2011, xl$Bed_Category, FUN = mean)

	x <- c(rep(1:3, each = 3, times = 2))
	x
	# Run-length encoding: lengths and values of runs of equal neighbours.
	rle(x)

	# Where to advertise? A multi-armed bandit approach.
	# Draw three row names with replacement, weighted by Y2011. The weights are
	# passed as `prob =` by name rather than by position.
	sample(rownames(xl), 3, replace = TRUE, prob = xl$Y2011)

	cut(xl$Y2001, 3) # Generate cut points automatically. Oh-oh
	quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
	# NOTE(review): the breaks are quantiles of Y2011 although the values being
	# cut are Y2001 (and the line above inspects the Y2001 quantiles) -- confirm
	# this is intended; Y2001 values outside the Y2011 range become NA.
	xl$Old_Bed_Category <- cut(xl$Y2001,
	    quantile(xl$Y2011, probs = c(0, 0.25, 0.75, 1)),
	    labels = c('Low', 'Medium', 'High'),
	    right = TRUE, include.lowest = TRUE) # Include both ends of the range

	xl

	# In R, the question is often "What's the function that does THAT?"
	# ftable() cross-tabulates the two category columns.
	ftable(xl[, c('Bed_Category', 'Old_Bed_Category')])

	## Riddle: How do I find the problematic 5?
	## ...
	## ...
	## ...
	# Row names of the rows that are 'Little' in one category and 'Low' in the other.
	rownames(xl[xl$Bed_Category == 'Little' & xl$Old_Bed_Category == 'Low', ])

	## More data plays
	# order() returns the row positions that would sort the column ...
	order(xl$Y2011) # Huh?
	# ... so indexing with it gives the rows sorted by Y2011.
	xl[order(xl$Y2011), ] # ahhhh
	# rank() gives each value's position in the sorted order; without ties it
	# is the inverse permutation of order().
	rank(xl$Y2011) # Inversed perm!

	## Stats & Probability
	# r<dist>() functions draw random numbers: 5 uniform draws between 0 and 3,
	# then one binomial draw (100 trials, success probability 0.5).
	runif(5, 0, 3)
	rbinom(1, 100, 0.5)
	# Histograms of 10, 100 and 1000 binomial draws: the shape gets closer to a
	# bell curve as the number of draws grows.
	hist(rbinom(10, 100, 0.5))
	hist(rbinom(100, 100, 0.5))
	hist(rbinom(1000, 100, 0.5)) # CLT!
	# 1000 normal draws with mean 1.5 and standard deviation 1.
	hist(rnorm(1000, 1.5, 1))

	library(ggplot2)
	l <- rnorm(1000, 1.5, 1)
	# NOTE(review): qplot() is deprecated in recent ggplot2 releases; it still
	# works but may warn -- confirm against the installed version.
	p <- qplot(l, geom = 'histogram')
	p
	# A plot object can be extended with `+`; here the x axis label is replaced.
	p + xlab("Coffee breaks per day")

	## More cool IO
	library(XML)
	theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
	# Parse every HTML table on the page into a list.
	tables <- readHTMLTable(theurl)
	# Row count of every table. NROW() gives 0 for a NULL entry, so the result
	# has exactly one element per table and which.max() indexes `tables`
	# correctly; unlist(lapply(...)) silently dropped NULL entries and could
	# shift the index.
	n.rows <- vapply(tables, NROW, integer(1))
	# Work with the table that has the most rows.
	tbl <- tables[[which.max(n.rows)]]
	tbl
	# Keep the first six columns and rename columns 2, 4 and 5.
	# NOTE(review): relies on the page layout also providing `Year` and
	# `Country` columns -- confirm against the live table.
	tbl <- tbl[, 1:6]
	colnames(tbl)[c(2, 4, 5)] <- c('Height', 'Type', 'Use')
	# Keep the text before the first whitespace and convert it to a number.
	tbl$Height <- as.numeric(gsub('\\s.*', '', tbl$Height))
	# Entries without any digit carry no year.
	tbl$Year[!grepl("\\d", tbl$Year)] <- NA
	# Drop everything from the first non-digit character on, then convert.
	tbl$Year <- as.numeric(sub("\\D.*", '', tbl$Year))

	# Height against year, coloured by country.
	qplot(data = tbl, x = Year, y = Height, color = Country)

	# Bar charts of the number of rows per Type, using complete rows only.
	qplot(data = na.omit(tbl), x = Type, geom = "bar")
	qplot(data = na.omit(tbl), x = Type, geom = "bar", fill = Country)

	## Some linear models
	# mtcars is a data set that ships with R.
	mtcars

	# Number of cars per cylinder count, with side-by-side bars per gear count.
	ggplot(mtcars, aes(x = factor(cyl), fill = factor(gear))) +
	  geom_bar(position = "dodge")

	# Simple linear regression of mpg on hp: print the fit, then keep it in `l`.
	lm(mpg ~ hp, data = mtcars)
	l <- lm(mpg ~ hp, data = mtcars)
	summary(l) # Look at summary(l)$r.squared
	qplot(x = hp, y = mpg, data = mtcars, geom = "point")
	qplot(x = hp, y = mpg, data = mtcars, geom = "point") +
	  geom_smooth(method = "lm")

	# Fitted values for the original rows, then predictions for new hp values.
	predict(l)
	predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))

	# Quadratic fit; I() makes ^ mean arithmetic power inside a formula.
	l <- lm(mpg ~ hp + I(hp^2), data = mtcars)
	summary(l)
	# Scatter plot with the fitted curve evaluated on a grid of hp values.
	qplot(hp, mpg, data = mtcars, geom = "point") +
	  geom_line(
	    data = data.frame(
	      hp = seq(50, 350, by = 3),
	      mpg = predict(l, data.frame(hp = seq(50, 350, by = 3)))
	    ),
	    mapping = aes(x = hp, y = mpg)
	  )

	# Degree-10 polynomial fit, drawn the same way.
	l <- lm(
	  mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) + I(hp^6) + I(hp^7) +
	    I(hp^8) + I(hp^9) + I(hp^10),
	  data = mtcars
	)
	qplot(hp, mpg, data = mtcars, geom = "point") +
	  geom_line(
	    data = data.frame(
	      hp = seq(50, 335, by = 3),
	      mpg = predict(l, data.frame(hp = seq(50, 335, by = 3)))
	    ),
	    mapping = aes(x = hp, y = mpg)
	  )

	# Multiple regression with three predictors.
	l <- lm(mpg ~ hp + wt + cyl, data = mtcars)
	summary(l)
	qplot(x = wt, y = mpg, data = mtcars, geom = "point") +
	  geom_smooth(method = "lm")

	## ... Incidentally...
	# Join tbl$Country against the row names of xl; all = FALSE keeps only the
	# rows that have a match in both tables.
	merge(tbl, xl, by.x = 'Country', by.y = 'row.names', all = FALSE)
	# Pivots
	library(reshape)
	# Aggregate Mandates per Category with sum(); na.rm = TRUE is passed on to sum().
	cast(df, Category ~ ., value = 'Mandates', fun.aggregate = sum, na.rm = TRUE)
	# Count the entries per Category / Large combination.
	cast(df, Category + Large ~ ., value = 'Mandates', fun.aggregate = length)