andrewheiss/marginaleffects-logged-predictors.md

## marginaleffects-logged-predictors.md

      
    Raw
  

              marginaleffects-logged-predictors.md
            
          
    library(tidyverse)
library(marginaleffects)
library(gapminder)

gapminder_2007 <- gapminder |> 
  filter(year == 2007)

# Use log() in the model formula
model <- lm(lifeExp ~ log(gdpPercap), data = gapminder_2007)

# Trying to find the slopes at these points
plot_predictions(model, condition = "gdpPercap") +
  geom_vline(xintercept = c(500, 5000, 10000, 20000, 40000))

# If I feed non-logged GDP values into slopes(), those estimates look wrong
model |> 
  slopes(newdata = datagrid(gdpPercap = c(500, 5000, 10000, 20000, 40000)))
#> 
#>       Term gdpPercap Estimate Std. Error    z Pr(>|z|)     S    2.5 %   97.5 %
#>  gdpPercap       500  0.01441   8.85e-04 16.3   <0.001 195.6 0.012672 0.016140
#>  gdpPercap      5000  0.00144   8.85e-05 16.3   <0.001 195.6 0.001267 0.001614
#>  gdpPercap     10000  0.00072   4.42e-05 16.3   <0.001 195.6 0.000634 0.000807
#>  gdpPercap     20000  0.00036   2.21e-05 16.3   <0.001 195.5 0.000317 0.000404
#>  gdpPercap     40000  0.00018   1.11e-05 16.3   <0.001 195.5 0.000158 0.000202
#> 
#> Columns: rowid, term, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, gdpPercap, predicted_lo, predicted_hi, predicted, lifeExp 
#> Type:  response

# If I feed logged GDP values into slopes(), these estimates seem right (but the
# gdpPercap column is ugly with values like 6.21, 8.52, etc.)
model |> 
  slopes(newdata = datagrid(gdpPercap = log(c(500, 5000, 10000, 20000, 40000))))
#> 
#>       Term gdpPercap Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
#>  gdpPercap      6.21    1.226     0.0753 16.3   <0.001 195.6 1.078  1.373
#>  gdpPercap      8.52    0.870     0.0534 16.3   <0.001 195.6 0.766  0.975
#>  gdpPercap      9.21    0.801     0.0492 16.3   <0.001 195.6 0.705  0.898
#>  gdpPercap      9.90    0.743     0.0456 16.3   <0.001 195.6 0.653  0.832
#>  gdpPercap     10.60    0.692     0.0425 16.3   <0.001 195.6 0.609  0.776
#> 
#> Columns: rowid, term, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, gdpPercap, predicted_lo, predicted_hi, predicted, lifeExp 
#> Type:  response

# I can exponentiate the values, though:
model |> 
  slopes(newdata = datagrid(gdpPercap = log(c(500, 5000, 10000, 20000, 40000)))) |> 
  mutate(gdpPercap = exp(gdpPercap)) |> 
  as_tibble() |>  # The changed column disappears from the data.table printing :shrug:
  select(term, gdpPercap, estimate, std.error)
#> # A tibble: 5 × 4
#>   term      gdpPercap estimate std.error
#>   <chr>         <dbl>    <dbl>     <dbl>
#> 1 gdpPercap      500     1.23     0.0753
#> 2 gdpPercap     5000.    0.870    0.0534
#> 3 gdpPercap    10000.    0.801    0.0492
#> 4 gdpPercap    20000.    0.743    0.0456
#> 5 gdpPercap    40000.    0.692    0.0425


# BUUUUT it gets weird with predictions() too. It seems like non-logged values are needed here:
model |> 
  predictions(newdata = datagrid(gdpPercap = c(500, 5000, 10000, 20000, 40000)))
#> 
#>  gdpPercap Estimate Std. Error     z Pr(>|z|)   S 2.5 % 97.5 %
#>        500     49.7      1.219  40.8   <0.001 Inf  47.3   52.1
#>       5000     66.3      0.599 110.6   <0.001 Inf  65.1   67.5
#>      10000     71.3      0.653 109.2   <0.001 Inf  70.0   72.6
#>      20000     76.3      0.826  92.4   <0.001 Inf  74.7   77.9
#>      40000     81.3      1.061  76.6   <0.001 Inf  79.2   83.4
#> 
#> Columns: rowid, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, gdpPercap, lifeExp 
#> Type:  response

# If they're logged, the predictions are wrong:
model |> 
  predictions(newdata = datagrid(gdpPercap = log(c(500, 5000, 10000, 20000, 40000))))
#> 
#>  gdpPercap Estimate Std. Error    z Pr(>|z|)    S 2.5 % 97.5 %
#>       6.21     18.1       3.06 5.91   <0.001 28.2  12.1   24.1
#>       8.52     20.4       2.93 6.97   <0.001 38.2  14.6   26.1
#>       9.21     20.9       2.89 7.24   <0.001 41.1  15.3   26.6
#>       9.90     21.5       2.86 7.51   <0.001 43.9  15.9   27.1
#>      10.60     22.0       2.83 7.75   <0.001 46.7  16.4   27.5
#> 
#> Columns: rowid, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, gdpPercap, lifeExp 
#> Type:  response
^{Created on 2024-04-24 with reprex v2.0.2}