library(tidyverse)
library(palmerpenguins)
# Keep only complete rows (drop any row with a missing value in any column)
penguins <- drop_na(penguins)
# group_by() on its own only splits the data into three behind-the-scenes
# datasets (one per species); nothing is computed yet. The grouping is stored
# so that later verbs run within each group (hence the "Groups: species [3]"
# note in the output)
penguins |>
  group_by(species)
#> # A tibble: 333 × 8
#> # Groups: species [3]
#> species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#> <fct> <fct> <dbl> <dbl> <int> <int>
#> 1 Adelie Torgersen 39.1 18.7 181 3750
#> 2 Adelie Torgersen 39.5 17.4 186 3800
#> 3 Adelie Torgersen 40.3 18 195 3250
#> 4 Adelie Torgersen 36.7 19.3 193 3450
#> 5 Adelie Torgersen 39.3 20.6 190 3650
#> 6 Adelie Torgersen 38.9 17.8 181 3625
#> 7 Adelie Torgersen 39.2 19.6 195 4675
#> 8 Adelie Torgersen 41.1 17.6 182 3200
#> 9 Adelie Torgersen 38.6 21.2 191 3800
#> 10 Adelie Torgersen 34.6 21.1 198 4400
#> # ℹ 323 more rows
#> # ℹ 2 more variables: sex <fct>, year <int>
# summarize() collapses each behind-the-scenes dataset into a single row. With
# only one grouping variable, the result comes back ungrouped: three rows and
# no "Groups: ..." note
penguins |>
  group_by(species) |>
  summarize(total = n())
#> # A tibble: 3 × 2
#> species total
#> <fct> <int>
#> 1 Adelie 146
#> 2 Chinstrap 68
#> 3 Gentoo 119
# With several grouping variables, summarize() drops only the last (right-most)
# one and keeps the rest. The result has six rows and is still grouped by
# species (hence the "Groups: species [3]" note and the extra
# "`summarise()` has grouped output by 'species'" message)
penguins |>
  group_by(species, sex) |>
  summarize(total = n())
#> `summarise()` has grouped output by 'species'. You can override using the
#> `.groups` argument.
#> # A tibble: 6 × 3
#> # Groups: species [3]
#> species sex total
#> <fct> <fct> <int>
#> 1 Adelie female 73
#> 2 Adelie male 73
#> 3 Chinstrap female 34
#> 4 Chinstrap male 34
#> 5 Gentoo female 58
#> 6 Gentoo male 61
# Swapping the order of species and sex mirrors this: species is now the
# right-most group and gets dropped, so the result is grouped by sex
penguins |>
  group_by(sex, species) |>
  summarize(total = n())
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
#> # A tibble: 6 × 3
#> # Groups: sex [2]
#> sex species total
#> <fct> <fct> <int>
#> 1 female Adelie 73
#> 2 female Chinstrap 34
#> 3 female Gentoo 58
#> 4 male Adelie 73
#> 5 male Chinstrap 34
#> 6 male Gentoo 61
# The leftover grouping matters when using mutate() on the summarized data.
# Here we add a proportion column, total / sum(total). The result has no
# grouping, so sum(total) covers all rows and prop adds up to 100%:
penguins |>
  group_by(species) |>
  summarize(total = n()) |>
  mutate(prop = total / sum(total))
#> # A tibble: 3 × 3
#> species total prop
#> <fct> <int> <dbl>
#> 1 Adelie 146 0.438
#> 2 Chinstrap 68 0.204
#> 3 Gentoo 119 0.357
# Grouping by two variables creates behind-the-scenes datasets for all six
# combinations of species and sex. summarize() then drops the sex grouping but
# leaves the data grouped by species, so mutate() calculates
# total / sum(total) *within* each species. The prop column now adds up to
# 300% rather than 100% (so 50% of Adelies are female, 50% are male, etc.)
penguins |>
  group_by(species, sex) |>
  summarize(total = n()) |>
  mutate(prop = total / sum(total))
#> `summarise()` has grouped output by 'species'. You can override using the
#> `.groups` argument.
#> # A tibble: 6 × 4
#> # Groups: species [3]
#> species sex total prop
#> <fct> <fct> <int> <dbl>
#> 1 Adelie female 73 0.5
#> 2 Adelie male 73 0.5
#> 3 Chinstrap female 34 0.5
#> 4 Chinstrap male 34 0.5
#> 5 Gentoo female 58 0.487
#> 6 Gentoo male 61 0.513
# With sex first, summarize() drops the species grouping and keeps the data
# grouped by sex, so mutate() works *within* each sex and prop adds up to 200%:
# 44% of female penguins are Adelies, 21% are Chinstraps, and 35% are Gentoos
penguins |>
  group_by(sex, species) |>
  summarize(total = n()) |>
  mutate(prop = total / sum(total))
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
#> # A tibble: 6 × 4
#> # Groups: sex [2]
#> sex species total prop
#> <fct> <fct> <int> <dbl>
#> 1 female Adelie 73 0.442
#> 2 female Chinstrap 34 0.206
#> 3 female Gentoo 58 0.352
#> 4 male Adelie 73 0.435
#> 5 male Chinstrap 34 0.202
#> 6 male Gentoo 61 0.363
# Calling ungroup() before mutate() makes the proportion use the whole dataset
# rather than sex- or species-specific groups. Here, 22% of all penguins are
# female Adelies, 10% are female Chinstraps, etc.
penguins |>
  group_by(sex, species) |>
  summarize(total = n()) |>
  ungroup() |>
  mutate(prop = total / sum(total))
#> `summarise()` has grouped output by 'sex'. You can override using the `.groups`
#> argument.
#> # A tibble: 6 × 4
#> sex species total prop
#> <fct> <fct> <int> <dbl>
#> 1 female Adelie 73 0.219
#> 2 female Chinstrap 34 0.102
#> 3 female Gentoo 58 0.174
#> 4 male Adelie 73 0.219
#> 5 male Chinstrap 34 0.102
#> 6 male Gentoo 61 0.183
# We don't have to rely on the automatic drop-the-last-group behavior; we can
# set our own grouping afterwards. Here summarize() drops the sex grouping, so
# left alone prop would add up to 300% (the proportion of each sex within each
# species). Adding group_by(sex) instead splits the rows into two
# behind-the-scenes datasets (female and male), so prop is the proportion of
# each species within each sex
penguins |>
  group_by(species, sex) |>
  summarize(total = n()) |>
  group_by(sex) |>
  mutate(prop = total / sum(total))
#> `summarise()` has grouped output by 'species'. You can override using the
#> `.groups` argument.
#> # A tibble: 6 × 4
#> # Groups: sex [2]
#> species sex total prop
#> <fct> <fct> <int> <dbl>
#> 1 Adelie female 73 0.442
#> 2 Adelie male 73 0.435
#> 3 Chinstrap female 34 0.206
#> 4 Chinstrap male 34 0.202
#> 5 Gentoo female 58 0.352
#> 6 Gentoo male 61 0.363
# Created on 2024-04-01 with reprex v2.0.2