title | output | editor_options | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
Predict type = 'terms' |
|
|
my_cars <- MASS::Cars93[, c('Price', 'Weight', 'RPM')]
head(my_cars)
## Price Weight RPM
## 1 15.9 2705 6300
## 2 33.9 3560 5500
## 3 29.1 3375 5500
## 4 37.7 3405 5500
## 5 30.0 3640 5700
## 6 15.7 2880 5200
fit <- lm(Price ~ Weight + RPM, data = my_cars)
coef(fit)
## (Intercept) Weight RPM
## -48.686662750 0.012930622 0.005389832
When you use 2 variables you'll see that predict will create a single vector of new predicted values (that makes sense).
a <- predict(fit, newdata = my_cars,type = "response")
str(a)
## Named num [1:93] 20.2 27 24.6 25 29.1 ...
## - attr(*, "names")= chr [1:93] "1" "2" "3" "4" ...
When you use type='terms' you instead get the individual predictions for each variable in the model.
b <- predict(fit, newdata = my_cars,type = "terms")
str(b)
## num [1:93, 1:2] -4.76 6.3 3.91 4.29 7.33 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:93] "1" "2" "3" "4" ...
## ..$ : chr [1:2] "Weight" "RPM"
## - attr(*, "constant")= num 19.5
head(b)
## Weight RPM
## 1 -4.757218 5.4941512
## 2 6.298464 1.1822857
## 3 3.906299 1.1822857
## 4 4.294218 1.1822857
## 5 7.332914 2.2602521
## 6 -2.494359 -0.4346639
attr(b, 'constant')
## [1] 19.50968
You can actually get the response values back by rowSum the matrix (from type='terms') and then adding in the constant. Which is found in the 'constant' attribute (note this is not the same as the model intercept).
my_cars$terms_calculation <- rowSums(b) + attr(b, 'constant')
my_cars$response_values <- a
head(my_cars)
## Price Weight RPM terms_calculation response_values
## 1 15.9 2705 6300 20.24661 20.24661
## 2 33.9 3560 5500 26.99043 26.99043
## 3 29.1 3375 5500 24.59826 24.59826
## 4 37.7 3405 5500 24.98618 24.98618
## 5 30.0 3640 5700 29.10284 29.10284
## 6 15.7 2880 5200 16.58065 16.58065
How are these values calulated?
It seems like if we scale
our data we can reproduce the numbers
# create data
set.seed(42)
n = 20
x = rnorm(n)
y = 2*x + 1 + rnorm(n)
# fit model
lm1 = lm(y ~ x)
# scale the x
x_scale = scale(x, scale = FALSE)
# predict on scaled values with type=terms
predicted <- predict(lm1, type = 'terms')
predicted
## x
## 1 2.7935936
## 2 -1.7927182
## 3 0.4056582
## 4 1.0447618
## 5 0.5031345
## 6 -0.7061816
## 7 3.1266424
## 8 -0.6790155
## 9 4.3276867
## 10 -0.6033257
## 11 2.6370039
## 12 4.9632066
## 13 -3.7454749
## 14 -1.1152894
## 15 -0.7706213
## 16 1.0520780
## 17 -1.1282361
## 18 -6.7488922
## 19 -6.2371328
## 20 2.6731220
## attr(,"constant")
## [1] 1.112848
The predicted values (type='terms') is the scale values * the model coefficients
# note i am not including the intercept here
calculated <- x_scale * lm1$coefficients[2]
calculated
## [,1]
## [1,] 2.7935936
## [2,] -1.7927182
## [3,] 0.4056582
## [4,] 1.0447618
## [5,] 0.5031345
## [6,] -0.7061816
## [7,] 3.1266424
## [8,] -0.6790155
## [9,] 4.3276867
## [10,] -0.6033257
## [11,] 2.6370039
## [12,] 4.9632066
## [13,] -3.7454749
## [14,] -1.1152894
## [15,] -0.7706213
## [16,] 1.0520780
## [17,] -1.1282361
## [18,] -6.7488922
## [19,] -6.2371328
## [20,] 2.6731220
## attr(,"scaled:center")
## [1] 0.19192
all(as.data.frame(predicted) == calculated)
## [1] TRUE
The constant: original intercept + sum of all the betas times the colmeans of the x's
calculated_constant <- lm1$coefficients[1] + lm1$coefficients[2] * mean(x)
calculated_constant
## (Intercept)
## 1.112848
calculated_constant == attr(predicted, 'constant')
## (Intercept)
## TRUE
If we take the above process and apply it to the cars93 dataset and fit it with 2 predictors, we can't fully reproduce the results.
my_cars <- MASS::Cars93[1:20, c('Price', 'Weight', 'RPM')]
fit <- lm(Price ~ Weight + RPM, data = my_cars)
# scale the weight
scaled <- scale(my_cars[-1])
predicted <- predict(fit, type = 'terms')
predicted
## Weight RPM
## 1 -10.1371543 9.2836501
## 2 1.9468707 3.3657855
## 3 -0.6678014 3.3657855
## 4 -0.2438005 3.3657855
## 5 3.0775397 4.8452516
## 6 -7.6638159 1.1465863
## 7 0.6748681 -1.8123460
## 8 9.6495533 -7.7302106
## 9 1.0282021 -1.8123460
## 10 2.7948725 -6.9904775
## 11 7.2468817 7.0644509
## 12 -13.1758273 1.1465863
## 13 -9.0064853 1.1465863
## 14 -2.5758053 -3.2918122
## 15 -3.2118067 1.1465863
## 16 4.1375419 -1.8123460
## 17 8.5188843 -7.7302106
## 18 6.8935476 -6.2507445
## 19 -0.5971346 -0.3328799
## 20 1.3108694 1.8863193
## attr(,"constant")
## [1] 23.59
calculated <- scaled[, 1] * fit$coefficients[2]
calculated
## 1 2 3 4 5
## -0.022939195 0.004405541 -0.001511156 -0.000551692 0.006964113
## 6 7 8 9 10
## -0.017342319 0.001527148 0.021835811 0.002326701 0.006324470
## 11 12 13 14 15
## 0.016398846 -0.029815357 -0.020380623 -0.005828746 -0.007267943
## 16 17 18 19 20
## 0.009362774 0.019277239 0.015599292 -0.001351246 0.002966344
all(as.data.frame(predicted)$Weight == calculated)
## [1] FALSE
For some reason we're off by a constant
predicted[, 1] / calculated ## off by a constant
## 1 2 3 4 5 6 7 8
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141
## 9 10 11 12 13 14 15 16
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141
## 17 18 19 20
## 441.9141 441.9141 441.9141 441.9141
# for fit
predicted[, 2] / (scaled[, 2] * fit$coefficients[3])
## 1 2 3 4 5 6 7 8
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714
## 9 10 11 12 13 14 15 16
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714
## 17 18 19 20
## 643.5714 643.5714 643.5714 643.5714
The values are off by a constant, which is the scale value
attr(scaled, 'scaled:scale')
## Weight RPM
## 441.9141 643.5714
So the questoin is, how are these values calculated?
# create data
set.seed(42)
n = 20
x = rnorm(n)
y = 2*x + 1 + rnorm(n)
# fit model
glm1 = glm(y ~ x)
# scale the x
x_scale = scale(x, scale = FALSE)
# predict on scaled values with type=terms
predicted <- predict(glm1, type = 'terms')
predicted
## x
## 1 2.7935936
## 2 -1.7927182
## 3 0.4056582
## 4 1.0447618
## 5 0.5031345
## 6 -0.7061816
## 7 3.1266424
## 8 -0.6790155
## 9 4.3276867
## 10 -0.6033257
## 11 2.6370039
## 12 4.9632066
## 13 -3.7454749
## 14 -1.1152894
## 15 -0.7706213
## 16 1.0520780
## 17 -1.1282361
## 18 -6.7488922
## 19 -6.2371328
## 20 2.6731220
## attr(,"constant")
## [1] 1.112848
calculated <- x_scale * glm1$coefficients[2]
calculated
## [,1]
## [1,] 2.7935936
## [2,] -1.7927182
## [3,] 0.4056582
## [4,] 1.0447618
## [5,] 0.5031345
## [6,] -0.7061816
## [7,] 3.1266424
## [8,] -0.6790155
## [9,] 4.3276867
## [10,] -0.6033257
## [11,] 2.6370039
## [12,] 4.9632066
## [13,] -3.7454749
## [14,] -1.1152894
## [15,] -0.7706213
## [16,] 1.0520780
## [17,] -1.1282361
## [18,] -6.7488922
## [19,] -6.2371328
## [20,] 2.6731220
## attr(,"scaled:center")
## [1] 0.19192
all(as.data.frame(predicted) == calculated)
## [1] TRUE
# the constant:
# original intercept + sum of all the betas times the colmeans of the x's
calculated_constant <- glm1$coefficients[1] + glm1$coefficients[2] * mean(x)
calculated_constant
## (Intercept)
## 1.112848
calculated_constant == attr(predicted, 'constant')
## (Intercept)
## TRUE
my_cars <- MASS::Cars93[1:20, c('Price', 'Weight', 'RPM')]
fit <- glm(Price ~ Weight + RPM, data = my_cars)
# scale the weight
scaled <- scale(my_cars[-1])
predicted <- predict(fit, type = 'terms')
predicted
## Weight RPM
## 1 -10.1371543 9.2836501
## 2 1.9468707 3.3657855
## 3 -0.6678014 3.3657855
## 4 -0.2438005 3.3657855
## 5 3.0775397 4.8452516
## 6 -7.6638159 1.1465863
## 7 0.6748681 -1.8123460
## 8 9.6495533 -7.7302106
## 9 1.0282021 -1.8123460
## 10 2.7948725 -6.9904775
## 11 7.2468817 7.0644509
## 12 -13.1758273 1.1465863
## 13 -9.0064853 1.1465863
## 14 -2.5758053 -3.2918122
## 15 -3.2118067 1.1465863
## 16 4.1375419 -1.8123460
## 17 8.5188843 -7.7302106
## 18 6.8935476 -6.2507445
## 19 -0.5971346 -0.3328799
## 20 1.3108694 1.8863193
## attr(,"constant")
## [1] 23.59
calculated <- scaled[, 1] * fit$coefficients[2]
calculated
## 1 2 3 4 5
## -0.022939195 0.004405541 -0.001511156 -0.000551692 0.006964113
## 6 7 8 9 10
## -0.017342319 0.001527148 0.021835811 0.002326701 0.006324470
## 11 12 13 14 15
## 0.016398846 -0.029815357 -0.020380623 -0.005828746 -0.007267943
## 16 17 18 19 20
## 0.009362774 0.019277239 0.015599292 -0.001351246 0.002966344
all(as.data.frame(predicted)$Weight == calculated)
## [1] FALSE
predicted[, 1] / calculated ## off by a constant
## 1 2 3 4 5 6 7 8
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141
## 9 10 11 12 13 14 15 16
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141
## 17 18 19 20
## 441.9141 441.9141 441.9141 441.9141
# for fit
predicted[, 2] / (scaled[, 2] * fit$coefficients[3])
## 1 2 3 4 5 6 7 8
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714
## 9 10 11 12 13 14 15 16
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714
## 17 18 19 20
## 643.5714 643.5714 643.5714 643.5714
The values are off by a constant, which is the scale value
attr(scaled, 'scaled:scale')
## Weight RPM
## 441.9141 643.5714