library(tidyverse)
library(marginaleffects)
library(broom)
library(palmerpenguins)
penguins <- penguins %>% drop_na(sex)
# Marginal means in conjoint world are just the averages for each of the levels in a factor
# variable included in a regression model (i.e. instead of using an omitted reference
# category, you get estimates for all the levels, just like when using y ~ 0 + categorical_x)
#
# See Leeper, Hobolt, and Tilley 2019: https://doi.org/10.1017/pan.2019.30
#
cregg::mm(penguins, formula = body_mass_g ~ species + sex)
#> outcome statistic feature level estimate std.error z p lower upper
#> 1 body_mass_g mm species Adelie 3706.164 37.88239 97.83344 0 3631.916 3780.412
#> 2 body_mass_g mm species Chinstrap 3733.088 46.33312 80.57061 0 3642.277 3823.899
#> 3 body_mass_g mm species Gentoo 5092.437 45.84557 111.07805 0 5002.581 5182.293
#> 4 body_mass_g mm sex female 3862.273 51.78184 74.58740 0 3760.782 3963.763
#> 5 body_mass_g mm sex male 4545.685 60.67694 74.91618 0 4426.760 4664.609
# In looking at Thomas's code for mm(), all that's really happening is there are
# individual intercept-less models for each categorical predictor. This does the
# exact same thing:
bind_rows(
tidy(lm(body_mass_g ~ 0 + species, data = penguins)),
tidy(lm(body_mass_g ~ 0 + sex, data = penguins))
)
#> # A tibble: 5 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 speciesAdelie 3706. 38.1 97.2 6.88e-245
#> 2 speciesChinstrap 3733. 55.9 66.8 8.16e-194
#> 3 speciesGentoo 5092. 42.2 121. 6.31e-275
#> 4 sexfemale 3862. 56.8 68.0 1.70e-196
#> 5 sexmale 4546. 56.3 80.7 8.39e-220
# And they're the same values you'd get from just doing group averages
penguins %>% group_by(species) %>% summarize(avg = mean(body_mass_g))
#> # A tibble: 3 × 2
#> species avg
#> <fct> <dbl>
#> 1 Adelie 3706.
#> 2 Chinstrap 3733.
#> 3 Gentoo 5092.
penguins %>% group_by(sex) %>% summarize(avg = mean(body_mass_g))
#> # A tibble: 2 × 2
#> sex avg
#> <fct> <dbl>
#> 1 female 3862.
#> 2 male 4546.
# With marginaleffects::marginal_means(), the estimates are a little off from
# what cregg:mm() does, but that's because conjoint world thinks of marginal
# means differently (i.e. the whole 0 + species, 0 + sex, 0 + whatever approach
# to individual group averages)
#
# Hooray for everyone naming different things the same thing.
#
# But these results are so close to mm()! And conceptually they're doing roughly
# the same thing—finding averages of factor levels while holding other things constant
model_penguin <- lm(body_mass_g ~ species + sex, data = penguins)
marginal_means(model_penguin, variables = c("species", "sex"))
#>
#> Term Value Mean Std. Error z Pr(>|z|) 2.5 % 97.5 %
#> species Adelie 3706 26.2 141.4 <0.001 3655 3758
#> species Gentoo 5084 29.0 175.1 <0.001 5027 5141
#> species Chinstrap 3733 38.4 97.2 <0.001 3658 3808
#> sex male 4508 25.1 179.7 <0.001 4459 4557
#> sex female 3841 25.3 151.8 <0.001 3791 3890
#>
#> Results averaged over levels of: species, sex
#> Columns: term, value, estimate, std.error, statistic, p.value, conf.low, conf.high
# Is there a way, with one of {marginaleffects} functions like predictions() or
# marginal_means() or whatever, to get cregg::mm() style group averages (i.e.
# *only* calculating spcies means, then *only* calculating sex means, etc.)
#
# From the documentation at
# https://vincentarelbundock.github.io/marginaleffects/articles/marginalmeans.html
# it seems like that's what marginal_means() is supposed to be doing
#
# WAIT the wts argument does this! Setting wts = "cells" fixes it:
marginal_means(model_penguin, newdata = c("species", "sex"), wts = "cells")
#>
#> Term Value Mean Std. Error z Pr(>|z|) 2.5 % 97.5 %
#> species Adelie 3706 26.2 141.4 <0.001 3655 3758
#> species Chinstrap 3733 38.4 97.2 <0.001 3658 3808
#> species Gentoo 5092 29.0 175.5 <0.001 5036 5149
#> sex female 3862 24.6 156.7 <0.001 3814 3911
#> sex male 4546 24.4 186.1 <0.001 4498 4594
#>
#> Results averaged over levels of: species, sex
#> Columns: term, value, estimate, std.error, statistic, p.value, conf.low, conf.high
# ↑ these are the same now ↓
bind_rows(
tidy(lm(body_mass_g ~ 0 + species, data = penguins)),
tidy(lm(body_mass_g ~ 0 + sex, data = penguins))
)
#> # A tibble: 5 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 speciesAdelie 3706. 38.1 97.2 6.88e-245
#> 2 speciesChinstrap 3733. 55.9 66.8 8.16e-194
#> 3 speciesGentoo 5092. 42.2 121. 6.31e-275
#> 4 sexfemale 3862. 56.8 68.0 1.70e-196
#> 5 sexmale 4546. 56.3 80.7 8.39e-220
Created on 2023-07-21 with reprex v2.0.2