https://stackoverflow.com/q/44550649/4926446?sem=2
I need to create a dummy (binary) variable from a character (string) variable. The data that I have looks like this:
# Example data: one row per patient, with up to two ICD-9 code columns.
# A missing diagnosis is NA; a single cell may hold several comma-separated
# codes (see patient 6).
dat <- tibble(
  pat_id = c(1, 2, 3, 4, 5, 6),
  icd9_1 = c("414.01", "411.89", NA, NA, NA, NA),
  icd9_2 = c("414.01", NA, "410.71", NA, "410.51", "272.0, 410.71")
)
dat
# A tibble: 6 x 3
#   pat_id icd9_1 icd9_2
#    <dbl> <chr>  <chr>
# 1      1 414.01 414.01
# 2      2 411.89 <NA>
# 3      3 <NA>   410.71
# 4      4 <NA>   <NA>
# 5      5 <NA>   410.51
# 6      6 <NA>   272.0, 410.71
I want to create three new binary variables:
icd9_bin_1 == binary (0/1) for icd9_1
icd9_bin_2 == binary (0/1) for icd9_2
icd9_bin == binary for either icd9_1 OR icd9_2
What is the fastest way to create these binary variables?
I've replaced the NAs with 0, turned the column into a factor and then recoded it, but that took forever.
# inspect the structure of the first diagnosis column
str(dat$icd9_1)
# swap missing codes for a placeholder; the numeric 0 is coerced to "0"
# because the column is character
dat$icd9_1 <- replace(dat$icd9_1, is.na(dat$icd9_1), 0)
# convert the column to a factor
dat$icd9_1 <- factor(dat$icd9_1)
# list the factor levels that need recoding
levels(dat$icd9_1)
# collapse the real codes into "icd9" and the placeholder into "no icd9 dx";
# the result is only printed, not assigned back to dat
mutate(
  dat,
  icd9_bin_1 = fct_collapse(
    icd9_1,
    icd9 = c("411.89", "414.01"),
    `no icd9 dx` = "0"
  )
)
# A tibble: 6 x 4
#   pat_id icd9_1 icd9_2        icd9_bin_1
#    <dbl> <fctr> <chr>         <fctr>
# 1      1 414.01 414.01        icd9
# 2      2 411.89 <NA>          icd9
# 3      3 0      410.71        no icd9 dx
# 4      4 0      <NA>          no icd9 dx
# 5      5 0      410.51        no icd9 dx
# 6      6 0      272.0, 410.71 no icd9 dx
I'm looking for a more elegant solution. Ideas?
# proposed solution 1 --------------------- @Phil
# Rebuild the example data so this answer runs on its own.
dat <- tibble(
  pat_id = c(1, 2, 3, 4, 5, 6),
  icd9_1 = c("414.01", "411.89", NA, NA, NA, NA),
  icd9_2 = c("414.01", NA, "410.71", NA, "410.51", "272.0, 410.71")
)
dat
# Do you just need?
# Label each row by whether icd9_1 holds a code: "icd9" when present,
# "no icd9 dx" when it is NA.
dat[["icd9_bin_1"]] <- if_else(
  !is.na(dat[["icd9_1"]]),
  "icd9",
  "no icd9 dx"
)
dat
# I'm tired, so I'm probably missing something...
# A tibble: 6 x 4
# pat_id icd9_1 icd9_2 icd9_bin_1
# <dbl> <chr> <chr> <chr>
# 1 1 414.01 414.01 icd9
# 2 2 411.89 <NA> icd9
# 3 3 <NA> 410.71 no icd9 dx
# 4 4 <NA> <NA> no icd9 dx
# 5 5 <NA> 410.51 no icd9 dx
# 6 6 <NA> 272.0, 410.71 no icd9 dx
# proposed solution 2 --------------------------- @Pierre Lafortune
# Rebuild the example data so this answer runs on its own.
dat <- tribble(
  ~pat_id, ~icd9_1, ~icd9_2,
  1, "414.01", "414.01",
  2, "411.89", NA,
  3, NA, "410.71",
  4, NA, NA,
  5, NA, "410.51",
  6, NA, "272.0, 410.71"
)
dat
# Label every diagnosis column (all columns except the first) as
# "yes-<column>" where a code is present and "no-<column>" where it is NA,
# storing the labels in icd9_bin_1 and icd9_bin_2.
#
# The labels are built one column at a time and assigned as a list with one
# element per new column. Assigning a single vector of nrow(dat) * 2 labels
# relies on base data.frame splitting the vector across columns; tibble only
# recycles length-1 vectors, so that form is rejected on a tibble.
dat[c("icd9_bin_1", "icd9_bin_2")] <- lapply(names(dat)[-1], function(col) {
  # is.na() + 1L is 1 for a present code ("yes") and 2 for NA ("no")
  paste(c("yes", "no")[is.na(dat[[col]]) + 1L], col, sep = "-")
})
dat
# proposed solution 3 ------------------------------ @pyll
# to generalize pierre solution do this
# create data.frame
pat_id <- 1:6
icd9_1 <- c("414.01", "411.89", NA, NA, NA, NA)
icd9_2 <- c("414.01", NA, "410.71", NA, "410.51", "272.0, 410.71")
icd9_3 <- c(NA, NA, "3", NA, NA, NA)
dat <- data.frame(pat_id, icd9_1, icd9_2, icd9_3)
dat
# solution

# Overwrite every column of `df` except `id_col` with presence labels.
#
# Args:
#   df:     a data frame holding an id column plus any number of code columns.
#   id_col: position of the id column to leave untouched (default: the first).
#
# Returns:
#   `df` with each non-id column replaced by a character vector that reads
#   "yes-<column>" where the original value was present and "no-<column>"
#   where it was NA.
flag_code_columns <- function(df, id_col = 1L) {
  # names(df)[-id_col] stays a character vector for any number of columns;
  # names(df[, -id_col]) would return NULL when only one code column is left,
  # because single-column extraction drops to a bare vector.
  code_cols <- names(df)[-id_col]
  if (length(code_cols) == 0L) {
    return(df)
  }
  df[code_cols] <- lapply(code_cols, function(col) {
    # is.na() + 1L is 1 for a present value ("yes") and 2 for NA ("no")
    paste(
      c("yes", "no")[is.na(df[[col]]) + 1L],
      rep(col, nrow(df)),
      sep = "-"
    )
  })
  df
}

dat <- flag_code_columns(dat)
dat
# pat_id icd9_1 icd9_2 icd9_3
# 1 1 yes-icd9_1 yes-icd9_2 no-icd9_3
# 2 2 yes-icd9_1 no-icd9_2 no-icd9_3
# 3 3 no-icd9_1 yes-icd9_2 yes-icd9_3
# 4 4 no-icd9_1 no-icd9_2 no-icd9_3
# 5 5 no-icd9_1 yes-icd9_2 no-icd9_3
# 6 6 no-icd9_1 yes-icd9_2 no-icd9_3