Last active
June 8, 2017 16:37
-
-
Save davidski/89daf9f3ae00a15208c6 to your computer and use it in GitHub Desktop.
Coursera RStudio Ozone Example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The Elements of Data Analytic Style
Jeff Leek
https://leanpub.com/datastyle/
R Programming for Data Science
Roger Peng
https://leanpub.com/rprogramming
The Art of Data Science
Roger Peng and Elizabeth Matsui
https://leanpub.com/artofdatascience
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Name: ozone_example.R | |
# Author: David F. Severski | |
# Purpose: Demo of ozone data analysis, lifted from Roger Peng's | |
# "The Art of Data Science" | |
library(readr) | |
library(dplyr) | |
library(ggplot2) | |
# Fetch Data ------------------------------
# Download the 2014 hourly ozone archive into ./data and extract it.
# Each step is skipped when its output already exists, so re-running the
# script neither re-downloads the archive nor re-extracts the CSV.
url <- "http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/hourly_44201_2014.zip"
data_dir <- "./data"
zip_path <- file.path(data_dir, "hourly_44201_2014.zip")
csv_path <- file.path(data_dir, "hourly_44201_2014.csv")

if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

if (!file.exists(zip_path)) {
  tryCatch(
    # Binary mode: the payload is a zip archive, not text.
    download.file(url, zip_path, quiet = TRUE, mode = "wb"),
    error = function(e) {
      # Remove any partial file; otherwise the file.exists() check above
      # would treat a truncated archive as a finished download next run.
      unlink(zip_path)
      stop(e)
    }
  )
}

# Extract only when the CSV read by the next section is not already on disk.
if (!file.exists(csv_path)) {
  unzip(zip_path, exdir = data_dir)
}
# Read data ------------------------------
# Column types follow the EPA file-format description at
# http://aqsdr1.epa.gov/aqsweb/aqstmp/airdata/FileFormats.html
# The compact col_types string has one letter per column
# (c = character, i = integer, n = number).
ozone <- read_csv(
  file = "data/hourly_44201_2014.csv",
  col_types = "ccccinnccccccnccnccccccc"
)

# Make the column names syntactically valid so they can be used unquoted
# (e.g. State.Name, Sample.Measurement) in the rest of the script.
ozone <- setNames(ozone, make.names(names(ozone)))
# Fri Sep 18 08:12:58 2015 ------------------------------
# Verify data
# Sanity-check the import: dimensions first, then structure, then the
# first and last rows of a few columns.
nrow(ozone)
ncol(ozone)
str(ozone)
glimpse(ozone)

# Columns 6, 7 and 10, selected by position, for a compact top/bottom preview.
preview_cols <- c(6, 7, 10)
head(ozone[, preview_cols])
tail(ozone[, preview_cols])
# Examine data ------------------------------
# Frequency of the first few Time.Local values.
head(table(ozone$Time.Local))

# All measurements taken at 02:00 local time.
ozone %>%
  filter(Time.Local == "02:00") %>%
  select(State.Name, County.Name, Date.Local, Time.Local, Sample.Measurement)

# One day of readings for King County, Washington. Converting to a plain
# data.frame prints every row rather than a truncated tibble preview.
ozone %>%
  filter(
    County.Name == "King",
    State.Name == "Washington",
    Date.Local == "2014-09-30"
  ) %>%
  select(Date.Local, Time.Local, Sample.Measurement) %>%
  as.data.frame()
# More Examination ------------------------------
# How many distinct State.Name values are there, and what are they?
ozone %>%
  distinct(State.Name) %>%
  nrow()
unique(ozone$State.Name)
# Interesting: Mexico, Puerto Rico, and DC are included.

# Distribution of the measurements. quantile() stops with an error on
# missing values unless na.rm = TRUE, so drop them here, matching the
# na.rm = TRUE used for the regional mean/median later in the script.
summary(ozone$Sample.Measurement)
quantile(ozone$Sample.Measurement, seq(0, 1, by = 0.1), na.rm = TRUE)

# Flag Washington rows so they can be highlighted in the state boxplot.
ozone <- ozone %>%
  mutate(Is.Washington = State.Name == "Washington")
# Visualize Data ------------------------------
# One boxplot of ozone measurements per state, with the state labels
# rotated so they stay legible.
gg <- ggplot(ozone, aes(x = State.Name, y = Sample.Measurement)) +
  geom_boxplot() +
  ylab("Ozone Level (ppm)") +
  ggtitle("Boxplot of Ozone Values by State") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1))
gg

# Same plot with an extra boxplot layer whose fill marks Washington.
gg + geom_boxplot(mapping = aes(fill = Is.Washington))
# Compare East vs West ------------------------------
# Label each row by monitor longitude: west of -100 degrees is "west",
# everything else is "east".
ozone$region <- factor(ifelse(ozone$Longitude < -100, "west", "east"))

# Row counts per region and state, printed in full as a plain data.frame.
ozone %>%
  group_by(region, State.Name) %>%
  tally() %>%
  as.data.frame()

# Drop the "Country of Mexico" rows once and reuse the result, so the
# summary table and the plot below are guaranteed to use the same rows.
ozone_excl_mexico <- ozone %>%
  filter(State.Name != "Country of Mexico")

# Mean and median ozone level per region.
us_only_ozone <- ozone_excl_mexico %>%
  group_by(region) %>%
  summarise(
    mean = mean(Sample.Measurement, na.rm = TRUE),
    median = median(Sample.Measurement, na.rm = TRUE)
  )
# Print the summary; an assignment alone shows nothing on the console.
us_only_ozone

# Boxplot of the same data, one box per region.
gg <- ggplot(ozone_excl_mexico, aes(x = region, y = Sample.Measurement)) +
  geom_boxplot() +
  theme_minimal() +
  ylab("Ozone Level (ppm)") +
  ggtitle("Boxplot of Ozone Values by Region")
gg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment