library(tidyverse)
# Example data
df <- tibble(org_id = 1:10,
outcome = 1:10,
issue_1 = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
issue_2 = c(NA, NA, "X", NA, "Y", NA, NA, NA, NA, "Z"))
df
#> # A tibble: 10 × 4
#> org_id outcome issue_1 issue_2
#> <int> <int> <chr> <chr>
#> 1 1 1 A <NA>
#> 2 2 2 B <NA>
#> 3 3 3 C X
#> 4 4 4 D <NA>
#> 5 5 5 E Y
#> 6 6 6 F <NA>
#> 7 7 7 G <NA>
#> 8 8 8 H <NA>
#> 9 9 9 I <NA>
#> 10 10 10 J Z
# Convert to long and remove missing values when there's no issue_2
df_long <- df |>
pivot_longer(c(issue_1, issue_2), names_to = "issue_number", values_to = "issue") |>
filter(!is.na(issue))
df_long
#> # A tibble: 13 × 4
#> org_id outcome issue_number issue
#> <int> <int> <chr> <chr>
#> 1 1 1 issue_1 A
#> 2 2 2 issue_1 B
#> 3 3 3 issue_1 C
#> 4 3 3 issue_2 X
#> 5 4 4 issue_1 D
#> 6 5 5 issue_1 E
#> 7 5 5 issue_2 Y
#> 8 6 6 issue_1 F
#> 9 7 7 issue_1 G
#> 10 8 8 issue_1 H
#> 11 9 9 issue_1 I
#> 12 10 10 issue_1 J
#> 13 10 10 issue_2 Z
# Model with single issue column
lm(outcome ~ issue, data = df_long)
#>
#> Call:
#> lm(formula = outcome ~ issue, data = df_long)
#>
#> Coefficients:
#> (Intercept) issueB issueC issueD issueE issueF
#> 1 1 2 3 4 5
#> issueG issueH issueI issueJ issueX issueY
#> 6 7 8 9 2 4
#> issueZ
#> 9
# One-hot encoding
df_wide <- df %>%
# Make the data long
pivot_longer(
cols = starts_with("issue"),
names_to = "issue_col", values_to = "issue_val",
values_drop_na = TRUE
) %>%
# Add an "issue_" prefix to all the values
mutate(issue_val = paste0("issue_", issue_val)) %>%
# Make the data wide with a column per issue, with each column being true if
# the organization works on that issue and false otherwise
pivot_wider(
names_from = issue_val, values_from = issue_val,
values_fn = \(x) !is.na(x), values_fill = FALSE,
id_cols = c("org_id", "outcome")
)
df_wide
#> # A tibble: 10 × 15
#> org_id outcome issue_A issue_B issue_C issue_X issue_D issue_E issue_Y
#> <int> <int> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 1 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE
#> 2 2 2 FALSE TRUE FALSE FALSE FALSE FALSE FALSE
#> 3 3 3 FALSE FALSE TRUE TRUE FALSE FALSE FALSE
#> 4 4 4 FALSE FALSE FALSE FALSE TRUE FALSE FALSE
#> 5 5 5 FALSE FALSE FALSE FALSE FALSE TRUE TRUE
#> 6 6 6 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 7 7 7 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 8 8 8 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 9 9 9 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 10 10 10 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> # ℹ 6 more variables: issue_F <lgl>, issue_G <lgl>, issue_H <lgl>,
#> # issue_I <lgl>, issue_J <lgl>, issue_Z <lgl>
# Model with lots of issue_columns
lm(outcome ~ issue_A + issue_B + issue_C + issue_X + issue_D + issue_E +
issue_Y + issue_F + issue_G + issue_H + issue_I + issue_J + issue_Z,
data = df_wide)
#>
#> Call:
#> lm(formula = outcome ~ issue_A + issue_B + issue_C + issue_X +
#> issue_D + issue_E + issue_Y + issue_F + issue_G + issue_H +
#> issue_I + issue_J + issue_Z, data = df_wide)
#>
#> Coefficients:
#> (Intercept) issue_ATRUE issue_BTRUE issue_CTRUE issue_XTRUE issue_DTRUE
#> 10 -9 -8 -7 NA -6
#> issue_ETRUE issue_YTRUE issue_FTRUE issue_GTRUE issue_HTRUE issue_ITRUE
#> -5 NA -4 -3 -2 -1
#> issue_JTRUE issue_ZTRUE
#> NA NA
Created on 2023-09-13 with reprex v2.0.2