Skip to content

Instantly share code, notes, and snippets.

@andrewheiss
Last active September 13, 2023 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewheiss/ed7dd6a0d143d6773f13c111c2288f39 to your computer and use it in GitHub Desktop.
Save andrewheiss/ed7dd6a0d143d6773f13c111c2288f39 to your computer and use it in GitHub Desktop.
library(tidyverse)

# Example data
df <- tibble(org_id = 1:10,
  outcome = 1:10,
  issue_1 = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
  issue_2 = c(NA, NA, "X", NA, "Y", NA, NA, NA, NA, "Z"))
df
#> # A tibble: 10 × 4
#>    org_id outcome issue_1 issue_2
#>     <int>   <int> <chr>   <chr>  
#>  1      1       1 A       <NA>   
#>  2      2       2 B       <NA>   
#>  3      3       3 C       X      
#>  4      4       4 D       <NA>   
#>  5      5       5 E       Y      
#>  6      6       6 F       <NA>   
#>  7      7       7 G       <NA>   
#>  8      8       8 H       <NA>   
#>  9      9       9 I       <NA>   
#> 10     10      10 J       Z

# Convert to long and remove missing values when there's no issue_2
df_long <- df |> 
  pivot_longer(c(issue_1, issue_2), names_to = "issue_number", values_to = "issue") |> 
  filter(!is.na(issue))
df_long
#> # A tibble: 13 × 4
#>    org_id outcome issue_number issue
#>     <int>   <int> <chr>        <chr>
#>  1      1       1 issue_1      A    
#>  2      2       2 issue_1      B    
#>  3      3       3 issue_1      C    
#>  4      3       3 issue_2      X    
#>  5      4       4 issue_1      D    
#>  6      5       5 issue_1      E    
#>  7      5       5 issue_2      Y    
#>  8      6       6 issue_1      F    
#>  9      7       7 issue_1      G    
#> 10      8       8 issue_1      H    
#> 11      9       9 issue_1      I    
#> 12     10      10 issue_1      J    
#> 13     10      10 issue_2      Z

# Model with single issue column
lm(outcome ~ issue, data = df_long)
#> 
#> Call:
#> lm(formula = outcome ~ issue, data = df_long)
#> 
#> Coefficients:
#> (Intercept)       issueB       issueC       issueD       issueE       issueF  
#>           1            1            2            3            4            5  
#>      issueG       issueH       issueI       issueJ       issueX       issueY  
#>           6            7            8            9            2            4  
#>      issueZ  
#>           9


# One-hot encoding
df_wide <- df %>%
  # Make the data long
  pivot_longer(
    cols = starts_with("issue"), 
    names_to = "issue_col", values_to = "issue_val", 
    values_drop_na = TRUE
  ) %>%
  # Add an "issue_" prefix to all the values
  mutate(issue_val = paste0("issue_", issue_val)) %>%
  # Make the data wide with a column per issue, with each column being true if
  # the organization works on that issue and false otherwise
  pivot_wider(
    names_from = issue_val, values_from = issue_val, 
    values_fn = \(x) !is.na(x), values_fill = FALSE, 
    id_cols = c("org_id", "outcome")
  )
df_wide
#> # A tibble: 10 × 15
#>    org_id outcome issue_A issue_B issue_C issue_X issue_D issue_E issue_Y
#>     <int>   <int> <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>  
#>  1      1       1 TRUE    FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#>  2      2       2 FALSE   TRUE    FALSE   FALSE   FALSE   FALSE   FALSE  
#>  3      3       3 FALSE   FALSE   TRUE    TRUE    FALSE   FALSE   FALSE  
#>  4      4       4 FALSE   FALSE   FALSE   FALSE   TRUE    FALSE   FALSE  
#>  5      5       5 FALSE   FALSE   FALSE   FALSE   FALSE   TRUE    TRUE   
#>  6      6       6 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#>  7      7       7 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#>  8      8       8 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#>  9      9       9 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#> 10     10      10 FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
#> # ℹ 6 more variables: issue_F <lgl>, issue_G <lgl>, issue_H <lgl>,
#> #   issue_I <lgl>, issue_J <lgl>, issue_Z <lgl>

# Model with lots of issue_columns
lm(outcome ~ issue_A + issue_B + issue_C + issue_X + issue_D + issue_E + 
    issue_Y + issue_F + issue_G + issue_H + issue_I + issue_J + issue_Z, 
  data = df_wide)
#> 
#> Call:
#> lm(formula = outcome ~ issue_A + issue_B + issue_C + issue_X + 
#>     issue_D + issue_E + issue_Y + issue_F + issue_G + issue_H + 
#>     issue_I + issue_J + issue_Z, data = df_wide)
#> 
#> Coefficients:
#> (Intercept)  issue_ATRUE  issue_BTRUE  issue_CTRUE  issue_XTRUE  issue_DTRUE  
#>          10           -9           -8           -7           NA           -6  
#> issue_ETRUE  issue_YTRUE  issue_FTRUE  issue_GTRUE  issue_HTRUE  issue_ITRUE  
#>          -5           NA           -4           -3           -2           -1  
#> issue_JTRUE  issue_ZTRUE  
#>          NA           NA

Created on 2023-09-13 with reprex v2.0.2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment