Skip to content

Instantly share code, notes, and snippets.

@k8hertweck
Last active October 18, 2019 02:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k8hertweck/b53b7914867eb32672c5b341b5b774d6 to your computer and use it in GitHub Desktop.
Save k8hertweck/b53b7914867eb32672c5b341b5b774d6 to your computer and use it in GitHub Desktop.
Seattle UseR: Intro to tidyverse, October 2019

Introduction to tidyverse

URL to this page: bit.ly/UseRSeattleOct2019

Seattle UseR meetup, October 2019

Get the data:

# Read big_epa_cars.csv (TidyTuesday, 2019-10-15) straight from GitHub; the
# readr:: prefix means no package has to be attached before running this line
big_epa_cars <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv")
#### Setup: load packages and data ####
# attach the tidyverse packages used throughout this script
library(tidyverse)

# read the TidyTuesday EPA cars data (2019-10-15) directly from GitHub
big_epa_cars <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv"
)

# first look at the data: leading rows, structure, and column names
head(big_epa_cars, n = 6L)
str(big_epa_cars)
colnames(big_epa_cars)
#### Selecting columns and filtering rows ####
# select(): keep specific columns by name
sel_columns <- select(
  big_epa_cars,
  fuelType, city08, co2
)

# a contiguous range of columns, written first:last
sel_range <- select(
  big_epa_cars,
  barrels08:charge240
)

# every column whose name begins with a shared prefix
# (related helpers: ends_with(), contains())
sel_starts <- select(
  big_epa_cars,
  starts_with("comb")
)

# filter(): keep only the rows that satisfy a condition
filtered_rows <- filter(
  big_epa_cars,
  fuelType == "Electricity"
)

## Challenge: create a new object from big_epa_cars called first_cols that includes only the first four columns
## Challenge: create a new object from big_epa_cars called city_cols that includes all columns containing "city"
## Challenge: create a new object from big_epa_cars called fuel_cols that includes the following columns: fuelType, drive, cylinders, year
## Challenge: create a new object from big_epa_cars called rear_rows that includes all cars with Rear-Wheel Drive (from column drive)
#### Combining commands ####
# Three ways to chain select() and filter(); all three produce the same table.

# 1. intermediate objects (fuel_cols is the answer to the earlier challenge)
fuel_cols <- select(big_epa_cars, fuelType, drive, cylinders, year)
rear_rows <- filter(fuel_cols, drive == "Rear-Wheel Drive")

# 2. nested calls: no intermediate object, but it must be read inside-out
rear_fuel <- filter(
  select(big_epa_cars, fuelType, drive, cylinders, year),
  drive == "Rear-Wheel Drive"
)

# 3. pipes: each step feeds the next, read top to bottom
# (selects the same four columns as above so the result matches rear_rows)
piped <- big_epa_cars %>%
  select(fuelType, drive, cylinders, year) %>%
  filter(drive == "Rear-Wheel Drive")
# extract guzzler and year for cars with more than 8 cylinders
piped2 <- big_epa_cars %>%
filter(cylinders > 8) %>%
select(guzzler, year)
# does the order of commands differ?
# NOTE: the next command is expected to fail -- select() keeps only guzzler and
# year, so filter() can no longer find the cylinders column and piped3 is not
# created
piped3 <- big_epa_cars %>%
select(guzzler, year) %>%
filter(cylinders > 8)
# in this case, yes it does matter!
## Challenge: Use pipes to extract the columns model, co2, and range from the object big_epa_cars for guzzlers (TRUE) that were manufactured after 2000 (column year)
# one filter() call: conditions separated by commas must all hold
big_epa_cars %>%
  filter(guzzler == TRUE, year > 2000) %>%
  select(model, co2, range)
#### Mutate ####
# mutate() adds a new column computed from existing ones
# (handy for unit conversions and ratios)

# scale city08 by 1.6 (approximate miles -> km factor), keep the result
city_mpg_km <- mutate(big_epa_cars, city_km = 1.6 * city08)

# create two converted columns in a single mutate() call;
# pipe into head() so only the first rows are printed
big_epa_cars %>%
  mutate(
    city_km = 1.6 * city08,
    comb_km = 1.6 * comb08
  ) %>%
  head()
## Challenge: extract only cars that are model Corolla or Prius and create a new column called diff_mpg representing the difference between highway mpg (highway08) and city mpg (city08)
big_epa_cars %>%
  # %in% keeps rows matching either model; == "Corolla" alone drops every Prius
  filter(model %in% c("Corolla", "Prius")) %>%
  mutate(diff_mpg = highway08 - city08) %>%
  # keep model next to diff_mpg so the two models can be told apart
  select(model, diff_mpg)
#### Split-apply-combine ####
# goal: summarize the data separately for each type of drive

# list the distinct values found in the drive column
big_epa_cars %>%
  distinct(drive)

# group_by() does little on its own, but pairs well with tally():
# count the number of cars recorded for each drive type
big_epa_cars %>%
  group_by(drive) %>%
  tally() # the empty parentheses are optional, but good practice
# rows with a missing drive value get their own count, too
# split-apply-combine in three steps:
#   1. split the data into groups,
#   2. apply an analysis to each group,
#   3. combine the results back into a single object

# average city mpg for each drive type
big_epa_cars %>%
  group_by(drive) %>%
  summarise(mean_city08 = mean(city08, na.rm = TRUE))
# why doesn't the above work to remove NA?
# (hint: na.rm applies to the city08 values, not to the drive groups)
# drop rows with a missing drive before grouping, then draw the group means
# as a bar chart
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive) %>%
  summarise(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot(aes(x = drive, y = mean_city08)) +
  geom_col()
# more complex: average city mpg per drive/model combination, then show the
# spread of those per-model averages within each drive type as boxplots
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive, model) %>%
  summarise(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot(aes(x = drive, y = mean_city08)) +
  geom_boxplot()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment