# mossmatters/intro_ggplot.R

## intro_ggplot.R
#install.packages(c('datasauRus',"ggplot2","dplyr"))
library(ggplot2)
library(datasauRus)
library(dplyr)

#Showing the importance of data visualization
# The anscombe dataset has four x and four y variables
# Load Anscombe's quartet (from the base `datasets` package) and print it.
data("anscombe")
anscombe

# We can create summary statistics about each pair

# Mean and standard deviation of every column. vapply() walks the data
# frame's columns directly instead of coercing it to a matrix as apply() does,
# and guarantees one number per column.
vapply(anscombe, mean, numeric(1))
vapply(anscombe, sd, numeric(1))

# Correlation within each (x_i, y_i) pair: column i holds x_i and column
# i + 4 holds the matching y_i.
anscombe.cor <- vapply(
  1:4,
  function(i) cor(anscombe[[i]], anscombe[[i + 4]]),
  numeric(1)
)
anscombe.cor

# Each of the pairs of variables has nearly identical means, standard deviations, and correlations
# But there is something different about these four pairs of data

# Reshape the four (x_i, y_i) pairs into long form so that all of them can be
# drawn in one figure. ggplot2 draws with grid graphics, so base R's
# par(mfrow = ...) does not lay out ggplot2 plots; facet_wrap() provides the
# 2 x 2 arrangement instead.
anscombe_long <- data.frame(
  pair = factor(rep(paste("Pair", 1:4), each = nrow(anscombe))),
  x = unlist(anscombe[paste0("x", 1:4)], use.names = FALSE),
  y = unlist(anscombe[paste0("y", 1:4)], use.names = FALSE)
)

# One panel per pair, each with its points and a fitted straight line.
# expand_limits() forces both axes to include the origin.
ggplot(anscombe_long, aes(x = x, y = y)) +
  geom_point() +
  expand_limits(x = 0, y = 0) +
  geom_smooth(method = "lm", formula = y ~ x) +
  facet_wrap(~pair, ncol = 2)

# It was not until we plotted the data that we saw any problem with it.
# This illustrates the importance of "exploratory visualization" to data science
# You can't always rely solely on summary statistics and fitting models!

# The code above uses the package ggplot2 to visualize the data
# ggplot2 uses the "grammar of graphics" to build plots for both exploratory analysis and for publication
# To learn more, follow this tutorial: http://www.rebeccabarter.com/blog/2017-11-17-ggplot2_tutorial/

# Assignment

# Resources
# ggplot2 cheat sheet: https://rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf

# The mpg dataset ships with ggplot2 (it is not part of base R). It gives
# fuel economy statistics for 38 popular models of cars.

# Load it into the workspace, then print it to take a first look.
data(list = "mpg")
mpg

# Your task is to produce visualizations that explore the data
#    Hint: You can use subset(mpg,factor=="value") to make a subset of the data
#       (for example, by manufacturer or year)

# Task 1: A scatter plot of cty (city mpg) versus hwy (highway mpg), with different colors based on year (2008 or 1999).
#    Make the size of the points related to the number of cylinders the engine has
#    Hint: use factor(variable) to turn a number into a discrete variable


# Task 2: A boxplot or violin plot showing mpg (cty or hwy) across each year
#     Bonus: Add jittered points to the plot with size related to the number of cylinders


# Task 3: Make a facet plot showing cty or hwy separated by the number of cylinders


# Identify a hypothesis you could test with statistical models that emerges from the plots you have made.


# Another exploration of data with identical mean, sd, and corr but with more fun patterns

# Summarise every dataset in the dozen. The grouped data is passed on to
# summarise() so the grouping is actually used, and the resulting table is
# printed so the means, standard deviations and correlations can be compared
# across datasets.
datasaurus_grouped <- group_by(datasaurus_dozen, dataset)
summarise(
  datasaurus_grouped,
  x_mean = mean(x),
  y_mean = mean(y),
  x_sd = sd(x),
  y_sd = sd(y),
  xy_cor = cor(x, y)
)

# The same per-dataset means and standard deviations as named vectors
# (one element per dataset), kept under their original names.
x.mean <- tapply(datasaurus_dozen$x, datasaurus_dozen$dataset, mean)
y.mean <- tapply(datasaurus_dozen$y, datasaurus_dozen$dataset, mean)
x.sd <- tapply(datasaurus_dozen$x, datasaurus_dozen$dataset, sd)
y.sd <- tapply(datasaurus_dozen$y, datasaurus_dozen$dataset, sd)

# Scatter plot of every dataset in its own panel, three panels per row.
# Points are coloured by dataset; theme_void() is ggplot2's empty theme, and
# the legend is switched off on top of it.
ggplot(datasaurus_dozen, aes(x, y, colour = dataset)) +
  geom_point() +
  facet_wrap(~dataset, ncol = 3) +
  theme_void() +
  theme(legend.position = "none")
	#install.packages(c('datasauRus',"ggplot2","dplyr"))
	library(ggplot2)
	library(datasauRus)
	library(dplyr)

	#Showing the importance of data visualization
	# The anscombe dataset has four x and four y variables
	# Load Anscombe's quartet (from the base `datasets` package) and print it.
	data("anscombe")
	anscombe

	# We can create summary statistics about each pair

	# Mean and standard deviation of every column. vapply() walks the data
	# frame's columns directly instead of coercing it to a matrix as apply() does,
	# and guarantees one number per column.
	vapply(anscombe, mean, numeric(1))
	vapply(anscombe, sd, numeric(1))

	# Correlation within each (x_i, y_i) pair: column i holds x_i and column
	# i + 4 holds the matching y_i.
	anscombe.cor <- vapply(
	  1:4,
	  function(i) cor(anscombe[[i]], anscombe[[i + 4]]),
	  numeric(1)
	)
	anscombe.cor

	# Each of the pairs of variables has nearly identical means, standard deviations, and correlations
	# But there is something different about these four pairs of data

	# Reshape the four (x_i, y_i) pairs into long form so that all of them can be
	# drawn in one figure. ggplot2 draws with grid graphics, so base R's
	# par(mfrow = ...) does not lay out ggplot2 plots; facet_wrap() provides the
	# 2 x 2 arrangement instead.
	anscombe_long <- data.frame(
	  pair = factor(rep(paste("Pair", 1:4), each = nrow(anscombe))),
	  x = unlist(anscombe[paste0("x", 1:4)], use.names = FALSE),
	  y = unlist(anscombe[paste0("y", 1:4)], use.names = FALSE)
	)

	# One panel per pair, each with its points and a fitted straight line.
	# expand_limits() forces both axes to include the origin.
	ggplot(anscombe_long, aes(x = x, y = y)) +
	  geom_point() +
	  expand_limits(x = 0, y = 0) +
	  geom_smooth(method = "lm", formula = y ~ x) +
	  facet_wrap(~pair, ncol = 2)

	# It was not until we plotted the data that we saw any problem with it.
	# This illustrates the importance of "exploratory visualization" to data science
	# You can't always rely solely on summary statistics and fitting models!

	# The code above uses the package ggplot2 to visualize the data
	# ggplot2 uses the "grammar of graphics" to build plots for both exploratory analysis and for publication
	# To learn more, follow this tutorial: http://www.rebeccabarter.com/blog/2017-11-17-ggplot2_tutorial/

	# Assignment

	# Resources
	# ggplot2 cheat sheet: https://rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf

	# The mpg dataset ships with ggplot2 (it is not part of base R). It gives
	# fuel economy statistics for 38 popular models of cars.

	# Load it into the workspace, then print it to take a first look.
	data(list = "mpg")
	mpg

	# Your task is to produce visualizations that explore the data
	# Hint: You can use subset(mpg,factor=="value") to make a subset of the data
	# (for example, by manufacturer or year)

	# Task 1: A scatter plot of cty (city mpg) versus hwy (highway mpg), with different colors based on year (2008 or 1999).
	# Make the size of the points related to the number of cylinders the engine has
	# Hint: use factor(variable) to turn a number into a discrete variable



	# Task 2: A boxplot or violin plot showing mpg (cty or hwy) across each year
	# Bonus: Add jittered points to the plot with size related to the number of cylinders


	# Task 3: Make a facet plot showing cty or hwy separated by the number of cylinders


	# Identify a hypothesis you could test with statistical models that emerges from the plots you have made.



	# Another exploration of data with identical mean, sd, and corr but with more fun patterns

	# Summarise every dataset in the dozen. The grouped data is passed on to
	# summarise() so the grouping is actually used, and the resulting table is
	# printed so the means, standard deviations and correlations can be compared
	# across datasets.
	datasaurus_grouped <- group_by(datasaurus_dozen, dataset)
	summarise(
	  datasaurus_grouped,
	  x_mean = mean(x),
	  y_mean = mean(y),
	  x_sd = sd(x),
	  y_sd = sd(y),
	  xy_cor = cor(x, y)
	)

	# The same per-dataset means and standard deviations as named vectors
	# (one element per dataset), kept under their original names.
	x.mean <- tapply(datasaurus_dozen$x, datasaurus_dozen$dataset, mean)
	y.mean <- tapply(datasaurus_dozen$y, datasaurus_dozen$dataset, mean)
	x.sd <- tapply(datasaurus_dozen$x, datasaurus_dozen$dataset, sd)
	y.sd <- tapply(datasaurus_dozen$y, datasaurus_dozen$dataset, sd)

	# Scatter plot of every dataset in its own panel, three panels per row.
	# Points are coloured by dataset; theme_void() is ggplot2's empty theme, and
	# the legend is switched off on top of it.
	ggplot(datasaurus_dozen, aes(x, y, colour = dataset)) +
	  geom_point() +
	  facet_wrap(~dataset, ncol = 3) +
	  theme_void() +
	  theme(legend.position = "none")