# tomschenkjr/reshape-tutorial.R

## reshape-tutorial.R
# Install the packages this tutorial uses: reshape and ggplot2. You may be
# asked to pick a server. In R, c() concatenates inputs as a vector, so a
# single install.packages() call can install several packages.
needed.packages <- c("reshape", "ggplot2")

# Only install what is missing, so re-running the script does not download
# packages that are already available.
missing.packages <- needed.packages[
  !vapply(needed.packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing.packages) > 0) {
  install.packages(missing.packages)
}

# You will always need to load the libraries after installing the package.
library(reshape)
library(ggplot2)

# First, we're going to work with the "mtcars" data set.
str(mtcars) # Explore the data structure.
rownames(mtcars) # Each row is labelled with a car make/model.

## Reshape was inspired by other base R functions that summarize data. Let's explore these first.
# summary() is the most basic data summary
summary(mtcars)

# The apply() function will summarize columns or rows.
apply(mtcars, 2, mean) # Average by column, other common named functions are median, sd, length.
apply(mtcars, 2, stats::quantile)
apply(mtcars, 2, function(x) x^2) # Can define your own function.
apply(mtcars, 1, mean) # Average by row, but this doesn't make sense -- why not?

# The by() function allows you to create summary data by subgroup.
by(mtcars$mpg, mtcars$cyl, mean) # What is the average mpg by the number of car cylinders.
# by() passes each subgroup to the function as a data frame, and mean() does
# not average a data frame column by column, so colMeans() is used here.
by(mtcars[, c(1, 3:5)], mtcars$cyl, colMeans) # Average mpg, disp, hp, and drat by number of cylinders.

# The by() function is a wrapper around tapply() for data frames.
tapply(mtcars$mpg, mtcars$cyl, mean)

# The table() function provides a simple count of elements.
table(mtcars$cyl)
table(mtcars$cyl, mtcars$gear) # Two-way table: cylinders (rows) by gears (columns).

## Reshape involves two steps: first "melt" the data, then "cast" it into the
## shape you want. Let's ask the question, how does the car shape and
## performance vary with cylinders?

# Melt the data. The argument name is spelled out in full (id.vars) rather
# than relying on partial argument matching.
mtcars.melt <- melt(mtcars, id.vars = "cyl")

# Inspect molten data
str(mtcars.melt)
head(mtcars.melt)
tail(mtcars.melt)

## Each row of data is transformed into multiple rows; each row holds one
## variable's value for one observed car, labelled with its cylinder count.

## Now we can cast the data into a reshaped table.
cast(mtcars.melt, cyl ~ variable, mean) # Average for all variables for each cylinder.
cast(mtcars.melt, cyl ~ variable, sd) # Standard deviation
cast(mtcars.melt, cyl ~ variable, stats::quantile) # Quantiles

## Let's ask a slightly different question: What is the relationship between the cylinders and horsepower?
# All of these are equivalent: hp is the only measured variable and every
# other column is an id variable.
mtcars.hp.melt <- melt(mtcars, measure.vars = "hp")
mtcars.hp.melt <- melt(mtcars, id.vars = -4) # Column 4 is hp; a negative index excludes it.
# "hp" must NOT be listed among the id variables, otherwise nothing is left to measure.
mtcars.hp.melt <- melt(mtcars, id.vars = c("mpg", "cyl", "disp", "drat", "wt", "qsec", "vs", "am", "gear", "carb"))

# Again cast the data to reshape
cast(mtcars.hp.melt, cyl ~ variable, mean) # hp is our melted variable

# Sometimes it's handy to store the casted data.
mtcars.hp.cast.mean <- cast(mtcars.hp.melt, cyl ~ variable, mean) # Mean
mtcars.hp.cast.sd <- cast(mtcars.hp.melt, cyl ~ variable, sd) # Standard Deviation
mtcars.hp.cast.length <- cast(mtcars.hp.melt, cyl ~ variable, length) # Length

# Now we can combine them into one summary table. Each cast result has two
# columns (cyl, hp), so the statistic is always in column 2.
mtcars.hp.cyl.descriptives <- data.frame(mtcars.hp.cast.mean, mtcars.hp.cast.sd[, 2], mtcars.hp.cast.length[, 2])
names(mtcars.hp.cyl.descriptives) <- c("Cylinders", "Average Horsepower", "Horsepower Standard Deviation", "Number of cars")

# An easier way to do this is in the casting
cast(mtcars.hp.melt, cyl ~ variable, c(mean, sd, length))

# Now we can plot it as well
plot(mtcars.hp.cyl.descriptives$Cylinders, mtcars.hp.cyl.descriptives[["Number of cars"]]) # Better to just use ggplot, but it shows how to do it.

## Which organization had the most FOIA (Freedom of Information Act) requests from the Law department?
# Read data from City of Chicago
foia <- read.csv("http://data.cityofchicago.org/api/views/44bx-ncpi/rows.csv")

# Quickest way to answer our question is the table function
table(foia$ORGANIZATION) # But this can't be used directly for plotting.

# Cast to make into a proper table
foia.melt <- melt(foia, id.vars = "ORGANIZATION")
foia.cast <- cast(foia.melt, ORGANIZATION ~ variable, length) # One count column per melted variable.

# Now we can graph it. The first count column is renamed and used as the
# number of requests.
names(foia.cast)[2] <- "Requests"
# The counts are already computed, so tell geom_bar() to plot them as-is
# (stat = "identity") instead of counting rows itself.
ggplot(foia.cast) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity")

# A little messy, so let's order them
foia.cast[order(foia.cast$Requests), ] # Oops, reverse order
foia.cast.order <- foia.cast[order(-foia.cast$Requests), ]
# Take at most five rows so a short table does not produce NA-filled rows.
foia.cast.order.topfive <- foia.cast.order[seq_len(min(5, nrow(foia.cast.order))), ]
ggplot(foia.cast.order.topfive) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity") # Plot the top 5 requesting agencies.

## But what are the total number of requests?

# Quickest answer is:
nrow(foia) # But doesn't show any of the organizations.

cast(foia.melt, ORGANIZATION ~ variable, length, margins = "grand_row") # Same answer

## So, I'm lazy and I hate all of these columns. They all display the exact same data. Let's fix that.
# The "." summarizes all variables instead of listing each column, so every
# request is counted once per melted variable. Divide by the number of melted
# variables (taken from the data rather than hard-coded) to undo that.
foia.n.measured <- length(unique(foia.melt$variable))
cast(foia.melt, ORGANIZATION ~ ., function(x) length(x) / foia.n.measured, margins = "grand_row")

## What aspects of student and school district characteristics correlate to test scores?
# Load data
# NOTE(review): confirm this URL is still reachable before running.
ed <- read.csv("http://www.ats.ucla.edu/stat/r/faq/hsb2.csv")

# Inspect data
head(ed)
str(ed)

# Melt data: the five test scores are the measured variables, every other
# column is kept as an id variable.
ed.melt <- melt(ed, measure.vars = c("read", "write", "math", "science", "socst"))

# Cast data showing relationship between the female indicator and test scores
ed.cast <- cast(ed.melt, female ~ variable, mean)
cast(ed.melt, female + race ~ variable, mean) # Add race variable
cast(ed.melt, female + race ~ ses + variable, mean) # Add socio-economic status

# This is getting to be pretty high dimension, so let's make a list
cast(ed.melt, female ~ race | variable, mean)

# We can make a heatmap showing test scores for each student.
# color = "white" is a fixed tile border, so it goes outside aes().
ggplot(ed.melt, aes(x = variable, y = id)) +
  geom_tile(aes(fill = value), color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue")
	# Install the packages this tutorial uses: reshape and ggplot2. You may be
	# asked to pick a server. In R, c() concatenates inputs as a vector, so a
	# single install.packages() call can install several packages.
	needed.packages <- c("reshape", "ggplot2")

	# Only install what is missing, so re-running the script does not download
	# packages that are already available.
	missing.packages <- needed.packages[
	  !vapply(needed.packages, requireNamespace, logical(1), quietly = TRUE)
	]
	if (length(missing.packages) > 0) {
	  install.packages(missing.packages)
	}

	# You will always need to load the libraries after installing the package.
	library(reshape)
	library(ggplot2)

	# First, we're going to work with the "mtcars" data set.
	str(mtcars) # Explore the data structure.
	rownames(mtcars) # Each row is labelled with a car make/model.

	## Reshape was inspired by other base R functions that summarize data. Let's explore these first.
	# summary() is the most basic data summary
	summary(mtcars)

	# The apply() function will summarize columns or rows.
	apply(mtcars, 2, mean) # Average by column, other common named functions are median, sd, length.
	apply(mtcars, 2, stats::quantile)
	apply(mtcars, 2, function(x) x^2) # Can define your own function.
	apply(mtcars, 1, mean) # Average by row, but this doesn't make sense -- why not?

	# The by() function allows you to create summary data by subgroup.
	by(mtcars$mpg, mtcars$cyl, mean) # What is the average mpg by the number of car cylinders.
	# by() passes each subgroup to the function as a data frame, and mean() does
	# not average a data frame column by column, so colMeans() is used here.
	by(mtcars[, c(1, 3:5)], mtcars$cyl, colMeans) # Average mpg, disp, hp, and drat by number of cylinders.

	# The by() function is a wrapper around tapply() for data frames.
	tapply(mtcars$mpg, mtcars$cyl, mean)

	# The table() function provides a simple count of elements.
	table(mtcars$cyl)
	table(mtcars$cyl, mtcars$gear) # Two-way table: cylinders (rows) by gears (columns).

	## Reshape involves two steps: first "melt" the data, then "cast" it into the
	## shape you want. Let's ask the question, how does the car shape and
	## performance vary with cylinders?

	# Melt the data. The argument name is spelled out in full (id.vars) rather
	# than relying on partial argument matching.
	mtcars.melt <- melt(mtcars, id.vars = "cyl")

	# Inspect molten data
	str(mtcars.melt)
	head(mtcars.melt)
	tail(mtcars.melt)

	## Each row of data is transformed into multiple rows; each row holds one
	## variable's value for one observed car, labelled with its cylinder count.

	## Now we can cast the data into a reshaped table.
	cast(mtcars.melt, cyl ~ variable, mean) # Average for all variables for each cylinder.
	cast(mtcars.melt, cyl ~ variable, sd) # Standard deviation
	cast(mtcars.melt, cyl ~ variable, stats::quantile) # Quantiles

	## Let's ask a slightly different question: What is the relationship between the cylinders and horsepower?
	# All of these are equivalent: hp is the only measured variable and every
	# other column is an id variable.
	mtcars.hp.melt <- melt(mtcars, measure.vars = "hp")
	mtcars.hp.melt <- melt(mtcars, id.vars = -4) # Column 4 is hp; a negative index excludes it.
	# "hp" must NOT be listed among the id variables, otherwise nothing is left to measure.
	mtcars.hp.melt <- melt(mtcars, id.vars = c("mpg", "cyl", "disp", "drat", "wt", "qsec", "vs", "am", "gear", "carb"))

	# Again cast the data to reshape
	cast(mtcars.hp.melt, cyl ~ variable, mean) # hp is our melted variable

	# Sometimes it's handy to store the casted data.
	mtcars.hp.cast.mean <- cast(mtcars.hp.melt, cyl ~ variable, mean) # Mean
	mtcars.hp.cast.sd <- cast(mtcars.hp.melt, cyl ~ variable, sd) # Standard Deviation
	mtcars.hp.cast.length <- cast(mtcars.hp.melt, cyl ~ variable, length) # Length

	# Now we can combine them into one summary table. Each cast result has two
	# columns (cyl, hp), so the statistic is always in column 2.
	mtcars.hp.cyl.descriptives <- data.frame(mtcars.hp.cast.mean, mtcars.hp.cast.sd[, 2], mtcars.hp.cast.length[, 2])
	names(mtcars.hp.cyl.descriptives) <- c("Cylinders", "Average Horsepower", "Horsepower Standard Deviation", "Number of cars")

	# An easier way to do this is in the casting
	cast(mtcars.hp.melt, cyl ~ variable, c(mean, sd, length))

	# Now we can plot it as well
	plot(mtcars.hp.cyl.descriptives$Cylinders, mtcars.hp.cyl.descriptives[["Number of cars"]]) # Better to just use ggplot, but it shows how to do it.

	## Which organization had the most FOIA (Freedom of Information Act) requests from the Law department?
	# Read data from City of Chicago
	foia <- read.csv("http://data.cityofchicago.org/api/views/44bx-ncpi/rows.csv")

	# Quickest way to answer our question is the table function
	table(foia$ORGANIZATION) # But this can't be used directly for plotting.

	# Cast to make into a proper table
	foia.melt <- melt(foia, id.vars = "ORGANIZATION")
	foia.cast <- cast(foia.melt, ORGANIZATION ~ variable, length) # One count column per melted variable.

	# Now we can graph it. The first count column is renamed and used as the
	# number of requests.
	names(foia.cast)[2] <- "Requests"
	# The counts are already computed, so tell geom_bar() to plot them as-is
	# (stat = "identity") instead of counting rows itself.
	ggplot(foia.cast) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity")

	# A little messy, so let's order them
	foia.cast[order(foia.cast$Requests), ] # Oops, reverse order
	foia.cast.order <- foia.cast[order(-foia.cast$Requests), ]
	# Take at most five rows so a short table does not produce NA-filled rows.
	foia.cast.order.topfive <- foia.cast.order[seq_len(min(5, nrow(foia.cast.order))), ]
	ggplot(foia.cast.order.topfive) + geom_bar(aes(x = ORGANIZATION, y = Requests), stat = "identity") # Plot the top 5 requesting agencies.

	## But what are the total number of requests?

	# Quickest answer is:
	nrow(foia) # But doesn't show any of the organizations.

	cast(foia.melt, ORGANIZATION ~ variable, length, margins = "grand_row") # Same answer

	## So, I'm lazy and I hate all of these columns. They all display the exact same data. Let's fix that.
	# The "." summarizes all variables instead of listing each column, so every
	# request is counted once per melted variable. Divide by the number of melted
	# variables (taken from the data rather than hard-coded) to undo that.
	foia.n.measured <- length(unique(foia.melt$variable))
	cast(foia.melt, ORGANIZATION ~ ., function(x) length(x) / foia.n.measured, margins = "grand_row")

	## What aspects of student and school district characteristics correlate to test scores?
	# Load data
	# NOTE(review): confirm this URL is still reachable before running.
	ed <- read.csv("http://www.ats.ucla.edu/stat/r/faq/hsb2.csv")

	# Inspect data
	head(ed)
	str(ed)

	# Melt data: the five test scores are the measured variables, every other
	# column is kept as an id variable.
	ed.melt <- melt(ed, measure.vars = c("read", "write", "math", "science", "socst"))

	# Cast data showing relationship between the female indicator and test scores
	ed.cast <- cast(ed.melt, female ~ variable, mean)
	cast(ed.melt, female + race ~ variable, mean) # Add race variable
	cast(ed.melt, female + race ~ ses + variable, mean) # Add socio-economic status

	# This is getting to be pretty high dimension, so let's make a list
	cast(ed.melt, female ~ race | variable, mean)

	# We can make a heatmap showing test scores for each student.
	# color = "white" is a fixed tile border, so it goes outside aes().
	ggplot(ed.melt, aes(x = variable, y = id)) +
	  geom_tile(aes(fill = value), color = "white") +
	  scale_fill_gradient(low = "white", high = "steelblue")