k8hertweck/IntroTidyverse.R

## IntroTidyverse.md

      
    Raw
  

              IntroTidyverse.md
            
          
    Introduction to tidyverse

URL to this page: bit.ly/UseRSeattleOct2019
Seattle UseR meetup, October 2019
Get the data:
big_epa_cars <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv")


Tidy Tuesday post here: https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-10-15
Data dictionary here
tidyverse cheatsheets


## IntroTidyverse.R
library(tidyverse)
# download the EPA cars data set (Tidy Tuesday, 2019-10-15) straight from GitHub
big_epa_cars <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv"
)

# first look at the data: leading rows, structure, and column names
head(big_epa_cars, n = 6L)
str(big_epa_cars)
names(big_epa_cars)

# selecting columns with dplyr: select() takes the data first, then the
# (unquoted) names of the columns to keep; the result is stored in a new object
sel_columns <- select(big_epa_cars, fuelType, city08, co2)
# select a range of columns: everything from barrels08 through charge240
sel_range <- select(big_epa_cars, barrels08:charge240)
# select all columns whose names start with the same prefix
sel_starts <- select(big_epa_cars, starts_with("comb"))
# related helpers: ends_with() and contains() match on a suffix or a substring
# select rows conditionally: keep only rows where fuelType is "Electricity"
filtered_rows <- filter(big_epa_cars, fuelType == "Electricity")

## Challenge: create a new object from big_epa_cars called first_cols that includes only the first four columns

## Challenge: create a new object from big_epa_cars called city_cols that includes all columns containing "city"

## Challenge: create a new object from big_epa_cars called fuel_cols that includes the following columns: fuelType, drive, cylinders, year

## Challenge: create a new object from big_epa_cars called rear_rows that includes all cars with Rear-Wheel Drive (from column drive)

#### Combining commands ####

# use intermediate objects to combine commands (answer from previous challenge):
# first keep four columns, then keep the Rear-Wheel Drive rows of that result
fuel_cols <- select(big_epa_cars, fuelType, drive, cylinders, year)
rear_rows <- filter(fuel_cols, drive == "Rear-Wheel Drive")

# nest commands (same result as rear_rows above, but written as one statement);
# the inner select() runs first and its output becomes the input of filter()
rear_fuel <- filter(select(big_epa_cars, fuelType, drive, cylinders, year), drive == "Rear-Wheel Drive")

# combine commands using pipes (improves readability of complex commands)
# %>% hands the object on its left to the call on its right as the first
# argument, so the steps read top to bottom in the order they run
# same example as above: the same four columns as fuel_cols are selected, so
# piped holds the same data as rear_rows and rear_fuel
piped <- big_epa_cars %>%
  select(fuelType, drive, cylinders, year) %>%
  filter(drive == "Rear-Wheel Drive")
# keep only cars with more than 8 cylinders, then keep just the guzzler and
# year columns
piped2 <- big_epa_cars %>%
  filter(cylinders > 8) %>%
  select(guzzler, year)
# does the order of commands differ?
# Here select() runs first and drops cylinders, so the filter() that follows
# fails because that column is no longer available. The error is the point of
# the example; try() reports it without halting the rest of the script when the
# file is run with source(). piped3 is never created.
try(
  piped3 <- big_epa_cars %>%
    select(guzzler, year) %>%
    filter(cylinders > 8)
)
# in this case, yes it does matter!

## Challenge: Use pipes to extract the columns model, co2, and range from the object big_epa_cars for guzzlers (TRUE) that were manufactured after 2000 (column year)

# answer: conditions separated by commas inside one filter() must all hold,
# so this keeps guzzlers from after 2000 and then picks the three columns
big_epa_cars %>%
  filter(guzzler == TRUE, year > 2000) %>%
  select(model, co2, range)

#### Mutate ####

# mutate() adds a new column computed from existing ones (unit conversions,
# ratios, ...)
# city08 holds city mpg; multiplying by 1.6 expresses the distance in km
city_mpg_km <- mutate(big_epa_cars, city_km = city08 * 1.6)
# several columns can be created in a single mutate() call;
# piping into head() prints only the first rows for easier viewing
big_epa_cars %>%
  mutate(
    city_km = city08 * 1.6,
    comb_km = comb08 * 1.6
  ) %>%
  head()

## Challenge: extract only cars that are model Corolla or Prius and create a new column called diff_mpg representing the difference between highway mpg (highway08) and city mpg (city08)

# answer: keep the two models first, then compute highway minus city mpg.
# %in% tests membership in a set, so both Corolla and Prius rows are kept
# (model == "Corolla" alone would drop every Prius). model is kept in the
# output so each difference can be attributed to its car.
big_epa_cars %>%
  filter(model %in% c("Corolla", "Prius")) %>%
  mutate(diff_mpg = highway08 - city08) %>%
  select(model, diff_mpg)

#### Split-apply-combine ####

# frame the problem: we want to summarize data by types of drive

# show categories in drive (one row per distinct value)
distinct(big_epa_cars, drive)

# group_by not always useful by itself, but powerful together with tally()
# count the number of cars with each type of drive
big_epa_cars %>%
  group_by(drive) %>%
  tally() # empty parentheses not required, but good practice
# shows missing data, too: cars with no drive recorded are counted under NA

# the split/apply/combine approach:
# split data into groups,
# apply an analysis to each group,
# combine results back into one object

# summarize average city mpg by drive: one output row per value of drive
big_epa_cars %>%
  group_by(drive) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE))
# why doesn't the above work to remove NA?
# na.rm = TRUE only drops missing city08 values inside mean(); rows whose
# drive is NA still form a group of their own, so an NA row remains

# drop cars with no recorded drive so NA is not treated as a category, then
# draw one bar per drive type whose height is the mean city mpg
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = drive, y = mean_city08)) +
  geom_col()

# more complex: average city mpg for every drive/model combination, then a
# boxplot of those per-model averages within each drive type
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive, model) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = drive, y = mean_city08)) +
  geom_boxplot()
	library(tidyverse)
	# download the EPA cars data set (Tidy Tuesday, 2019-10-15) straight from GitHub
	big_epa_cars <- readr::read_csv(
	  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv"
	)

	# first look at the data: leading rows, structure, and column names
	head(big_epa_cars, n = 6L)
	str(big_epa_cars)
	names(big_epa_cars)

	# selecting columns with dplyr: select() takes the data first, then the
	# (unquoted) names of the columns to keep; the result is stored in a new object
	sel_columns <- select(big_epa_cars, fuelType, city08, co2)
	# select a range of columns: everything from barrels08 through charge240
	sel_range <- select(big_epa_cars, barrels08:charge240)
	# select all columns whose names start with the same prefix
	sel_starts <- select(big_epa_cars, starts_with("comb"))
	# related helpers: ends_with() and contains() match on a suffix or a substring
	# select rows conditionally: keep only rows where fuelType is "Electricity"
	filtered_rows <- filter(big_epa_cars, fuelType == "Electricity")

	## Challenge: create a new object from big_epa_cars called first_cols that includes only the first four columns

	## Challenge: create a new object from big_epa_cars called city_cols that includes all columns containing "city"

	## Challenge: create a new object from big_epa_cars called fuel_cols that includes the following columns: fuelType, drive, cylinders, year

	## Challenge: create a new object from big_epa_cars called rear_rows that includes all cars with Rear-Wheel Drive (from column drive)

	#### Combining commands ####

	# use intermediate objects to combine commands (answer from previous challenge):
	# first keep four columns, then keep the Rear-Wheel Drive rows of that result
	fuel_cols <- select(big_epa_cars, fuelType, drive, cylinders, year)
	rear_rows <- filter(fuel_cols, drive == "Rear-Wheel Drive")

	# nest commands (same result as rear_rows above, but written as one statement);
	# the inner select() runs first and its output becomes the input of filter()
	rear_fuel <- filter(select(big_epa_cars, fuelType, drive, cylinders, year), drive == "Rear-Wheel Drive")

	# combine commands using pipes (improves readability of complex commands)
	# %>% hands the object on its left to the call on its right as the first
	# argument, so the steps read top to bottom in the order they run
	# same example as above: the same four columns as fuel_cols are selected, so
	# piped holds the same data as rear_rows and rear_fuel
	piped <- big_epa_cars %>%
	  select(fuelType, drive, cylinders, year) %>%
	  filter(drive == "Rear-Wheel Drive")
	# keep only cars with more than 8 cylinders, then keep just the guzzler and
	# year columns
	piped2 <- big_epa_cars %>%
	  filter(cylinders > 8) %>%
	  select(guzzler, year)
	# does the order of commands differ?
	# Here select() runs first and drops cylinders, so the filter() that follows
	# fails because that column is no longer available. The error is the point of
	# the example; try() reports it without halting the rest of the script when the
	# file is run with source(). piped3 is never created.
	try(
	  piped3 <- big_epa_cars %>%
	    select(guzzler, year) %>%
	    filter(cylinders > 8)
	)
	# in this case, yes it does matter!

	## Challenge: Use pipes to extract the columns model, co2, and range from the object big_epa_cars for guzzlers (TRUE) that were manufactured after 2000 (column year)

	# answer: conditions separated by commas inside one filter() must all hold,
	# so this keeps guzzlers from after 2000 and then picks the three columns
	big_epa_cars %>%
	  filter(guzzler == TRUE, year > 2000) %>%
	  select(model, co2, range)

	#### Mutate ####

	# mutate() adds a new column computed from existing ones (unit conversions,
	# ratios, ...)
	# city08 holds city mpg; multiplying by 1.6 expresses the distance in km
	city_mpg_km <- mutate(big_epa_cars, city_km = city08 * 1.6)
	# several columns can be created in a single mutate() call;
	# piping into head() prints only the first rows for easier viewing
	big_epa_cars %>%
	  mutate(
	    city_km = city08 * 1.6,
	    comb_km = comb08 * 1.6
	  ) %>%
	  head()

	## Challenge: extract only cars that are model Corolla or Prius and create a new column called diff_mpg representing the difference between highway mpg (highway08) and city mpg (city08)

	# answer: keep the two models first, then compute highway minus city mpg.
	# %in% tests membership in a set, so both Corolla and Prius rows are kept
	# (model == "Corolla" alone would drop every Prius). model is kept in the
	# output so each difference can be attributed to its car.
	big_epa_cars %>%
	  filter(model %in% c("Corolla", "Prius")) %>%
	  mutate(diff_mpg = highway08 - city08) %>%
	  select(model, diff_mpg)

	#### Split-apply-combine ####

	# frame the problem: we want to summarize data by types of drive

	# show categories in drive (one row per distinct value)
	distinct(big_epa_cars, drive)

	# group_by not always useful by itself, but powerful together with tally()
	# count the number of cars with each type of drive
	big_epa_cars %>%
	group_by(drive) %>%
	tally() # empty parentheses not required, but good practice
	# shows missing data, too: cars with no drive recorded are counted under NA

	# the split/apply/combine approach:
	# split data into groups,
	# apply an analysis to each group,
	# combine results back into one object

	# summarize average city mpg by drive: one output row per value of drive
	big_epa_cars %>%
	group_by(drive) %>%
	summarize(mean_city08 = mean(city08, na.rm = TRUE))
	# why doesn't the above work to remove NA?
	# na.rm = TRUE only drops missing city08 values inside mean(); rows whose
	# drive is NA still form a group of their own, so an NA row remains

	# drop cars with no recorded drive so NA is not treated as a category, then
	# draw one bar per drive type whose height is the mean city mpg
	big_epa_cars %>%
	  filter(!is.na(drive)) %>%
	  group_by(drive) %>%
	  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
	  ggplot(mapping = aes(x = drive, y = mean_city08)) +
	  geom_col()

	# more complex: average city mpg for every drive/model combination, then a
	# boxplot of those per-model averages within each drive type
	big_epa_cars %>%
	  filter(!is.na(drive)) %>%
	  group_by(drive, model) %>%
	  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
	  ggplot(mapping = aes(x = drive, y = mean_city08)) +
	  geom_boxplot()