Skip to content

Instantly share code, notes, and snippets.

@jilmun
Created May 24, 2016 16:19
Show Gist options
  • Save jilmun/d2a7e6d053106f9c951d2e970edcb8e9 to your computer and use it in GitHub Desktop.
Save jilmun/d2a7e6d053106f9c951d2e970edcb8e9 to your computer and use it in GitHub Desktop.
# create dummy data -------------------------------------------------------
# Builds the 10-row example data frame `d` used by the sections that follow:
#   col1 - factor, sampled with replacement from "a", "b", "c"
#   col2 - character, sampled with replacement from "x", "y", "z"
#          (kept as strings via stringsAsFactors = FALSE)
#   col3 - numeric, runif(10) scaled by 10
#   col4 - numeric, col3 plus a further runif(10) draw
# NOTE(review): the printed values below were presumably produced with
# R < 3.6.0. R 3.6.0 changed the default algorithm used by sample(), so the
# same seed draws different letters on newer versions -- confirm before
# comparing against the pasted output.
set.seed(1) # fixed seed so repeated runs on one R version give the same data
d <- data.frame(
  col1 = sample(letters[1:3], 10, replace = TRUE),
  col2 = sample(letters[24:26], 10, replace = TRUE),
  col3 = runif(10) * 10,
  stringsAsFactors = FALSE
)
d$col1 <- as.factor(d$col1) # only col1 becomes a factor; col2 stays character
d$col4 <- d$col3 + runif(10)
d
str(d)
# col1 col2 col3 col4
# 1 a x 9.3470523 9.8291324
# 2 b x 2.1214252 2.7209910
# 3 b z 6.5167377 7.0102790
# 4 c y 1.2555510 1.4417686
# 5 a z 2.6722067 3.4995800
# 6 c y 3.8611409 4.5296077
# 7 c z 0.1339033 0.9281432
# 8 b z 3.8238796 3.9318232
# 9 b y 8.6969085 9.4206194
# 10 a z 3.4034900 3.8147644
# 'data.frame': 10 obs. of 4 variables:
# $ col1: Factor w/ 3 levels "a","b","c": 1 2 2 3 1 3 3 2 2 1
# $ col2: chr "x" "x" "z" "y" ...
# $ col3: num 9.35 2.12 6.52 1.26 2.67 ...
# $ col4: num 9.83 2.72 7.01 1.44 3.5 ...
# data.matrix -------------------------------------------------------------
# data.matrix() converts a data frame into a numeric matrix: the factor
# column (col1) is replaced by its integer level codes and the numeric
# columns (col3, col4) are carried over unchanged.
# NOTE(review): the NA column shown below for the character column col2 is
# presumably output from R < 4.0.0. From R 4.0.0 on, data.matrix() converts
# character columns to factors and then to integer codes instead of NA --
# confirm against the R version in use.
data.matrix(d) # string columns return NA on older R (see note above the call)
# col1 col2 col3 col4
# [1,] 1 NA 9.3470523 9.8291324
# [2,] 2 NA 2.1214252 2.7209910
# [3,] 2 NA 6.5167377 7.0102790
# [4,] 3 NA 1.2555510 1.4417686
# [5,] 1 NA 2.6722067 3.4995800
# [6,] 3 NA 3.8611409 4.5296077
# [7,] 3 NA 0.1339033 0.9281432
# [8,] 2 NA 3.8238796 3.9318232
# [9,] 2 NA 8.6969085 9.4206194
# [10,] 1 NA 3.4034900 3.8147644
# model.matrix ------------------------------------------------------------
# One-sided formula with a single numeric predictor: the design matrix has
# an "(Intercept)" column of ones followed by col3 as-is.
model.matrix(~ col3, data = d)
# (Intercept) col3
# 1 1 9.3470523
# 2 1 2.1214252
# 3 1 6.5167377
# 4 1 1.2555510
# 5 1 2.6722067
# 6 1 3.8611409
# 7 1 0.1339033
# 8 1 3.8238796
# 9 1 8.6969085
# 10 1 3.4034900
# attr(,"assign")
# [1] 0 1
# Two-sided formula: the response on the left (col4) is ignored, so the
# result has the same columns as the one-sided `~ col3` call.
model.matrix(col4 ~ col3, data = d)
# (Intercept) col3
# 1 1 9.3470523
# 2 1 2.1214252
# 3 1 6.5167377
# 4 1 1.2555510
# 5 1 2.6722067
# 6 1 3.8611409
# 7 1 0.1339033
# 8 1 3.8238796
# 9 1 8.6969085
# 10 1 3.4034900
# attr(,"assign")
# [1] 0 1
# Categorical predictors with an intercept: the first level of each variable
# (col1a and col2x) gets no column of its own; the remaining levels become
# 0/1 indicator columns.
model.matrix(~ col1 + col2, data = d)
# (Intercept) col1b col1c col2y col2z
# 1 1 0 0 0 0
# 2 1 1 0 0 0
# 3 1 1 0 0 1
# 4 1 0 1 1 0
# 5 1 0 0 0 1
# 6 1 0 1 1 0
# 7 1 0 1 0 1
# 8 1 1 0 0 1
# 9 1 1 0 1 0
# 10 1 0 0 0 1
# attr(,"assign")
# [1] 0 1 1 2 2
# attr(,"contrasts")
# attr(,"contrasts")$col1
# [1] "contr.treatment"
#
# attr(,"contrasts")$col2
# [1] "contr.treatment"
# "- 1" (or, equivalently, "+ 0") drops the intercept. col1 then keeps an
# indicator column for every level, while col2 still omits its first level
# (col2x).
model.matrix(~ col1 + col2 - 1, data = d)
# col1a col1b col1c col2y col2z
# 1 1 0 0 0 0
# 2 0 1 0 0 0
# 3 0 1 0 0 1
# 4 0 0 1 1 0
# 5 1 0 0 0 1
# 6 0 0 1 1 0
# 7 0 0 1 0 1
# 8 0 1 0 0 1
# 9 0 1 0 1 0
# 10 1 0 0 0 1
# attr(,"assign")
# [1] 1 1 1 2 2
# attr(,"contrasts")
# attr(,"contrasts")$col1
# [1] "contr.treatment"
#
# attr(,"contrasts")$col2
# [1] "contr.treatment"
# "." stands for every column of d other than the response, "- col3" takes
# col3 back out and "+ 0" drops the intercept, leaving col1 + col2 without an
# intercept: same results as the explicit `~col1+col2-1` call above.
model.matrix(col4 ~ . + 0 - col3, data = d)
# col1a col1b col1c col2y col2z
# 1 1 0 0 0 0
# 2 0 1 0 0 0
# 3 0 1 0 0 1
# 4 0 0 1 1 0
# 5 1 0 0 0 1
# 6 0 0 1 1 0
# 7 0 0 1 0 1
# 8 0 1 0 0 1
# 9 0 1 0 1 0
# 10 1 0 0 0 1
# attr(,"assign")
# [1] 1 1 1 2 2
# attr(,"contrasts")
# attr(,"contrasts")$col1
# [1] "contr.treatment"
#
# attr(,"contrasts")$col2
# [1] "contr.treatment"
# one hot encoding --------------------------------------------------------
# Contrast function that keeps a column for every level of a factor.
# It delegates to contr.sum() with `contrasts = FALSE` hard-wired, so the
# coding matrix has as many columns as levels.
#   n         - forwarded unchanged to contr.sum() (a level count or the
#               vector of levels)
#   contrasts - accepted so the signature matches what callers of contrast
#               functions pass in, but deliberately never used
#   sparse    - forwarded unchanged to contr.sum()
# Returns whatever contr.sum() returns for those arguments.
contr.onehot <- function(n, contrasts, sparse = FALSE) {
  indicator_coding <- contr.sum(n, contrasts = FALSE, sparse = sparse)
  indicator_coding
}
# set options
# Make contr.onehot the default contrast function (first entry: unordered
# factors, second entry: ordered factors). This is a session-wide setting
# that affects every later model.matrix()/model-fitting call, so the previous
# value is captured in `old_opts`; run `options(old_opts)` to restore it when
# you are done with one-hot encoding.
old_opts <- options(contrasts = c("contr.onehot", "contr.onehot"))
# use model.matrix function
# With the intercept removed and contr.onehot active, every level of both
# col1 and col2 gets its own 0/1 column.
model.matrix(~ col1 + col2 - 1, data = d)
# col1a col1b col1c col2x col2y col2z
# 1 1 0 0 1 0 0
# 2 0 1 0 1 0 0
# 3 0 1 0 0 0 1
# 4 0 0 1 0 1 0
# 5 1 0 0 0 0 1
# 6 0 0 1 0 1 0
# 7 0 0 1 0 0 1
# 8 0 1 0 0 0 1
# 9 0 1 0 0 1 0
# 10 1 0 0 0 0 1
# attr(,"assign")
# [1] 1 1 1 2 2 2
# attr(,"contrasts")
# attr(,"contrasts")$col1
# [1] "contr.onehot"
#
# attr(,"contrasts")$col2
# [1] "contr.onehot"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment