Skip to content

Instantly share code, notes, and snippets.

@davidski
Last active June 8, 2017 16:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save davidski/89daf9f3ae00a15208c6 to your computer and use it in GitHub Desktop.
Save davidski/89daf9f3ae00a15208c6 to your computer and use it in GitHub Desktop.
Coursera RStudio Ozone Example
The Elements of Data Analytic Style
Jeff Leek
https://leanpub.com/datastyle/
R Programming for Data Science
Roger Peng
https://leanpub.com/rprogramming
The Art of Data Science
Roger Peng and Elizabeth Matsui
https://leanpub.com/artofdatascience
# Name: ozone_example.R
# Author: David F. Severski
# Purpose: Demo of ozone data analysis, lifted from Roger Peng's
# "The Art of Data Science"
library(readr)
library(dplyr)
library(ggplot2)
# Fetch Data ------------------------------
# Download the hourly_44201_2014 archive into ./data (only if it is not
# already there) and extract it into the same directory.
url <- "http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/hourly_44201_2014.zip"
data_dir <- "./data"
zip_path <- file.path(data_dir, "hourly_44201_2014.zip")
if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}
if (!file.exists(zip_path)) {
  # The download is skipped whenever the archive exists, so a truncated file
  # left behind by a failed transfer would be reused on every later run.
  # Remove the partial file on failure so the next run retries the download.
  # mode = "wb" forces a binary transfer (matters on Windows).
  status <- tryCatch(
    download.file(url, zip_path, quiet = TRUE, mode = "wb"),
    error = function(e) {
      unlink(zip_path)
      stop(e)
    }
  )
  if (status != 0) {
    unlink(zip_path)
    stop("Download failed with status ", status, ": ", url, call. = FALSE)
  }
}
unzip(zip_path, exdir = data_dir)
# Read data ------------------------------
# col_types is readr's compact specification: one letter per column
# (c = character, i = integer, n = number). The types were derived from
# http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/FileFormats.html
ozone <- read_csv(
  "data/hourly_44201_2014.csv",
  col_types = "ccccinnccccccnccnccccccc"
)
# Convert the header names into syntactically valid R names.
ozone <- setNames(ozone, make.names(names(ozone)))
# Fri Sep 18 08:12:58 2015 ------------------------------
# Verify data
# Row and column counts, followed by two structural overviews of the table.
nrow(ozone)
ncol(ozone)
str(ozone)
glimpse(ozone)
# Inspect columns 6, 7 and 10 at the top and at the bottom of the table.
ozone %>%
  select(6:7, 10) %>%
  head()
ozone %>%
  select(6:7, 10) %>%
  tail()
# Examine data ------------------------------
# Counts for the first few Time.Local values.
head(table(ozone$Time.Local))
# All readings whose local time is exactly 02:00.
ozone %>%
  filter(Time.Local == "02:00") %>%
  select(State.Name, County.Name, Date.Local, Time.Local, Sample.Measurement)
# Readings for King County, Washington on 2014-09-30, shown as a plain
# data.frame.
ozone %>%
  filter(
    County.Name == "King",
    State.Name == "Washington",
    Date.Local == "2014-09-30"
  ) %>%
  select(Date.Local, Time.Local, Sample.Measurement) %>%
  as.data.frame()
# More Examination ------------------------------
# Number of distinct State.Name values, then the values themselves.
n_distinct(ozone$State.Name)
unique(ozone$State.Name)
# Interesting: Mexico, Puerto Rico, and DC are included.
# Distribution of the measurements: five-number summary plus deciles.
summary(ozone$Sample.Measurement)
# quantile() stops with an error if any value is NA unless na.rm = TRUE;
# the regional mean/median later in this script already drop NAs, so do the
# same here. The result is unchanged when there are no missing values.
quantile(ozone$Sample.Measurement, seq(0, 1, by = 0.1), na.rm = TRUE)
# look at WA
# Logical flag marking the Washington rows.
ozone <- ozone %>%
  mutate(Is.Washington = State.Name == "Washington")
# Visualize Data ------------------------------
# One boxplot of Sample.Measurement per state. The x-axis labels are rotated
# 30 degrees; theme() is applied after theme_minimal() so the rotation is not
# reset by the complete theme.
gg <- ggplot(ozone, aes(x = State.Name, y = Sample.Measurement)) +
  geom_boxplot() +
  ylab("Ozone Level (ppm)") +
  ggtitle("Boxplot of Ozone Values by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))
gg
# Let's highlight WA only
# Adds a second boxplot layer whose fill is mapped to the Is.Washington flag.
gg + geom_boxplot(mapping = aes(fill = Is.Washington))
# Compare East vs West ------------------------------
# Label each row "west" when its Longitude is below -100, otherwise "east".
ozone$region <- factor(ifelse(ozone$Longitude < -100, "west", "east"))
# Row counts for every region/state combination.
ozone %>%
  count(region, State.Name) %>%
  as.data.frame()
# Drop the "Country of Mexico" rows once and reuse the result for both the
# summary table and the plot, instead of repeating the filter.
us_rows <- ozone %>%
  filter(State.Name != "Country of Mexico")
# Mean and median measurement per region (NAs ignored).
us_only_ozone <- us_rows %>%
  group_by(region) %>%
  summarise(
    mean = mean(Sample.Measurement, na.rm = TRUE),
    median = median(Sample.Measurement, na.rm = TRUE)
  )
# Display the summary; it was previously computed but never shown.
us_only_ozone
# Boxplot of the same rows, one box per region.
gg <- ggplot(us_rows, aes(x = region, y = Sample.Measurement)) +
  geom_boxplot() +
  theme_minimal() +
  ylab("Ozone Level (ppm)") +
  ggtitle("Boxplot of Ozone Values by Region")
gg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment