# Load the tidyverse; the readr, dplyr and ggplot2 functions and the %>% pipe
# used throughout this script come from it.
library(tidyverse)

# Read the big_epa_cars data (TidyTuesday, 2019-10-15) straight from GitHub.
big_epa_cars <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv"
)

# Inspect the data: first rows, structure, and column names.
head(big_epa_cars)
str(big_epa_cars)
colnames(big_epa_cars)

# selecting columns with dplyr
sel_columns <- select(big_epa_cars, fuelType, city08, co2)

# select a range of adjacent columns (from barrels08 through charge240)
sel_range <- select(big_epa_cars, barrels08:charge240)

# select columns whose names start with the same prefix
# (ends_with() and contains() work the same way)
sel_starts <- select(big_epa_cars, starts_with("comb"))

# select rows conditionally
filtered_rows <- filter(big_epa_cars, fuelType == "Electricity")

## Challenge: create a new object from big_epa_cars called first_cols that
## includes only the first four columns

## Challenge: create a new object from big_epa_cars called city_cols that
## includes all columns containing "city"

## Challenge: create a new object from big_epa_cars called fuel_cols that
## includes the following columns: fuelType, drive, cylinders, year

## Challenge: create a new object from big_epa_cars called rear_rows that
## includes all cars with Rear-Wheel Drive (from column drive)

#### Combining commands ####

# use intermediate objects to combine commands (answer from previous challenge)
fuel_cols <- select(big_epa_cars, fuelType, drive, cylinders, year)
rear_rows <- filter(fuel_cols, drive == "Rear-Wheel Drive")

# nest commands: same result as rear_rows above, but written as one nested
# call with no intermediate object (select() runs first, then filter())
rear_fuel <- filter(
  select(big_epa_cars, fuelType, drive, cylinders, year),
  drive == "Rear-Wheel Drive"
)

# combine commands using pipes (improves readability of complex commands)
# same example as above: keep four columns, then keep only rear-wheel-drive cars
piped <- big_epa_cars %>%
  select(fuelType, drive, cylinders, year) %>%
  filter(drive == "Rear-Wheel Drive")

# extract guzzler and year for cars with more than 8 cylinders
piped2 <- big_epa_cars %>%
  filter(cylinders > 8) %>%
  select(guzzler, year)

# does the order of commands matter?
# NOTE: this version is expected to fail: select() keeps only guzzler and year,
# so the cylinders column is no longer available when filter() runs
piped3 <- big_epa_cars %>%
  select(guzzler, year) %>%
  filter(cylinders > 8)
# in this case, yes it does matter!

## Challenge: Use pipes to extract the columns model, co2, and range from the
## object big_epa_cars for guzzlers (TRUE) that were manufactured after 2000
## (column year)

# both conditions must hold, so they can share a single filter() call
big_epa_cars %>%
  filter(guzzler == TRUE, year > 2000) %>%
  select(model, co2, range)

#### Mutate ####

# mutate allows unit conversions or ratios, creates a new column

# convert miles to km (1 mile is roughly 1.6 km)
city_mpg_km <- big_epa_cars %>%
  mutate(city_km = city08 * 1.6)

# convert two columns at once, send to head for easier viewing
big_epa_cars %>%
  mutate(
    city_km = city08 * 1.6,
    comb_km = comb08 * 1.6
  ) %>%
  head()

## Challenge: extract only cars that are model Corolla or Prius and create a
## new column called diff_mpg representing the difference between highway mpg
## (highway08) and city mpg (city08)

# %in% keeps rows whose model matches either name
big_epa_cars %>%
  filter(model %in% c("Corolla", "Prius")) %>%
  mutate(diff_mpg = highway08 - city08) %>%
  select(diff_mpg)

#### Split-apply-combine ####

# frame the problem: we want to summarize data by types of drive

# show categories in drive
distinct(big_epa_cars, drive)

# group_by not always useful by itself, but powerful together with tally()
# count the number of cars (rows) for each type of drive
big_epa_cars %>%
  group_by(drive) %>%
  tally() # empty parentheses not required, but good practice
# shows missing data, too

# the split/apply/combine approach:
#   split data into groups,
#   apply an analysis to each group,
#   combine results back into one object

# summarize average city mpg by drive
big_epa_cars %>%
  group_by(drive) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE))
# why doesn't the above work to remove NA?
# (hint: na.rm = TRUE only drops missing city08 values inside mean();
# rows with a missing drive still form their own group)

# remove NA, add visualization:
# drop rows with a missing drive before grouping, then draw one bar per drive
# type showing its mean city mpg
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = drive, y = mean_city08))

# more complex:
# compute the mean city mpg for every drive/model combination, then show the
# spread of those per-model means within each drive type as a boxplot
big_epa_cars %>%
  filter(!is.na(drive)) %>%
  group_by(drive, model) %>%
  summarize(mean_city08 = mean(city08, na.rm = TRUE)) %>%
  ggplot() +
  geom_boxplot(aes(x = drive, y = mean_city08))