chendaniely/predict_type.md

## predict_type.md

      
    Raw
  

              predict_type.md
            
          
  title
  output
  editor_options
  
  
  Predict type = 'terms'
  
  
  html_document
  
  
  keep_md
  toc
  
  
  true
  true
  
  
  chunk_output_type
  
  
  console
  
  
my_cars <- MASS::Cars93[, c('Price', 'Weight', 'RPM')]
head(my_cars)
##   Price Weight  RPM
## 1  15.9   2705 6300
## 2  33.9   3560 5500
## 3  29.1   3375 5500
## 4  37.7   3405 5500
## 5  30.0   3640 5700
## 6  15.7   2880 5200

lm

fit <- lm(Price ~ Weight + RPM, data = my_cars)
coef(fit)
##   (Intercept)        Weight           RPM 
## -48.686662750   0.012930622   0.005389832

When you use 2 variables you'll see that predict will create a single vector of new predicted values (that makes sense).
a <- predict(fit, newdata = my_cars,type = "response")
str(a)
##  Named num [1:93] 20.2 27 24.6 25 29.1 ...
##  - attr(*, "names")= chr [1:93] "1" "2" "3" "4" ...

When you use type='terms' you instead get the individual predictions for each variable in the model.
b <- predict(fit, newdata = my_cars,type = "terms")
str(b)
##  num [1:93, 1:2] -4.76 6.3 3.91 4.29 7.33 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:93] "1" "2" "3" "4" ...
##   ..$ : chr [1:2] "Weight" "RPM"
##  - attr(*, "constant")= num 19.5

head(b)
##      Weight        RPM
## 1 -4.757218  5.4941512
## 2  6.298464  1.1822857
## 3  3.906299  1.1822857
## 4  4.294218  1.1822857
## 5  7.332914  2.2602521
## 6 -2.494359 -0.4346639

attr(b, 'constant')
## [1] 19.50968

You can actually get the response values back by rowSum the matrix (from type='terms') and then adding in the constant.
Which is found in the 'constant' attribute (note this is not the same as the model intercept).
my_cars$terms_calculation <- rowSums(b) + attr(b, 'constant')
my_cars$response_values <- a

head(my_cars)
##   Price Weight  RPM terms_calculation response_values
## 1  15.9   2705 6300          20.24661        20.24661
## 2  33.9   3560 5500          26.99043        26.99043
## 3  29.1   3375 5500          24.59826        24.59826
## 4  37.7   3405 5500          24.98618        24.98618
## 5  30.0   3640 5700          29.10284        29.10284
## 6  15.7   2880 5200          16.58065        16.58065

Minimal example

How are these values calulated?
It seems like if we scale our data we can reproduce the numbers
# create data
set.seed(42)
n = 20
x = rnorm(n)
y = 2*x + 1 + rnorm(n)

# fit model
lm1 = lm(y ~ x)

# scale the x
x_scale = scale(x, scale = FALSE)

# predict on scaled values with type=terms
predicted <- predict(lm1, type = 'terms')
predicted
##             x
## 1   2.7935936
## 2  -1.7927182
## 3   0.4056582
## 4   1.0447618
## 5   0.5031345
## 6  -0.7061816
## 7   3.1266424
## 8  -0.6790155
## 9   4.3276867
## 10 -0.6033257
## 11  2.6370039
## 12  4.9632066
## 13 -3.7454749
## 14 -1.1152894
## 15 -0.7706213
## 16  1.0520780
## 17 -1.1282361
## 18 -6.7488922
## 19 -6.2371328
## 20  2.6731220
## attr(,"constant")
## [1] 1.112848

The predicted values (type='terms') is the scale values * the model coefficients
# note i am not including the intercept here
calculated <- x_scale * lm1$coefficients[2]
calculated
##             [,1]
##  [1,]  2.7935936
##  [2,] -1.7927182
##  [3,]  0.4056582
##  [4,]  1.0447618
##  [5,]  0.5031345
##  [6,] -0.7061816
##  [7,]  3.1266424
##  [8,] -0.6790155
##  [9,]  4.3276867
## [10,] -0.6033257
## [11,]  2.6370039
## [12,]  4.9632066
## [13,] -3.7454749
## [14,] -1.1152894
## [15,] -0.7706213
## [16,]  1.0520780
## [17,] -1.1282361
## [18,] -6.7488922
## [19,] -6.2371328
## [20,]  2.6731220
## attr(,"scaled:center")
## [1] 0.19192

all(as.data.frame(predicted) == calculated)
## [1] TRUE

The constant: original intercept + sum of all the betas times the colmeans of the x's
calculated_constant <- lm1$coefficients[1] + lm1$coefficients[2] * mean(x)
calculated_constant
## (Intercept) 
##    1.112848

calculated_constant == attr(predicted, 'constant')
## (Intercept) 
##        TRUE

Cars example

If we take the above process and apply it to the cars93 dataset and fit it with 2 predictors,
we can't fully reproduce the results.
my_cars <- MASS::Cars93[1:20, c('Price', 'Weight', 'RPM')]
fit <- lm(Price ~ Weight + RPM, data = my_cars)

# scale the weight
scaled <- scale(my_cars[-1])

predicted <- predict(fit, type = 'terms')
predicted
##         Weight        RPM
## 1  -10.1371543  9.2836501
## 2    1.9468707  3.3657855
## 3   -0.6678014  3.3657855
## 4   -0.2438005  3.3657855
## 5    3.0775397  4.8452516
## 6   -7.6638159  1.1465863
## 7    0.6748681 -1.8123460
## 8    9.6495533 -7.7302106
## 9    1.0282021 -1.8123460
## 10   2.7948725 -6.9904775
## 11   7.2468817  7.0644509
## 12 -13.1758273  1.1465863
## 13  -9.0064853  1.1465863
## 14  -2.5758053 -3.2918122
## 15  -3.2118067  1.1465863
## 16   4.1375419 -1.8123460
## 17   8.5188843 -7.7302106
## 18   6.8935476 -6.2507445
## 19  -0.5971346 -0.3328799
## 20   1.3108694  1.8863193
## attr(,"constant")
## [1] 23.59

calculated <- scaled[, 1] * fit$coefficients[2]
calculated
##            1            2            3            4            5 
## -0.022939195  0.004405541 -0.001511156 -0.000551692  0.006964113 
##            6            7            8            9           10 
## -0.017342319  0.001527148  0.021835811  0.002326701  0.006324470 
##           11           12           13           14           15 
##  0.016398846 -0.029815357 -0.020380623 -0.005828746 -0.007267943 
##           16           17           18           19           20 
##  0.009362774  0.019277239  0.015599292 -0.001351246  0.002966344

all(as.data.frame(predicted)$Weight == calculated)
## [1] FALSE

For some reason we're off by a constant
predicted[, 1] / calculated ## off by a constant
##        1        2        3        4        5        6        7        8 
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 
##        9       10       11       12       13       14       15       16 
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 
##       17       18       19       20 
## 441.9141 441.9141 441.9141 441.9141

# for fit
predicted[, 2] / (scaled[, 2] * fit$coefficients[3])
##        1        2        3        4        5        6        7        8 
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 
##        9       10       11       12       13       14       15       16 
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 
##       17       18       19       20 
## 643.5714 643.5714 643.5714 643.5714

The values are off by a constant, which is the scale value
attr(scaled, 'scaled:scale')
##   Weight      RPM 
## 441.9141 643.5714

So the questoin is, how are these values calculated?
glm -- same behaviour

Minimal example

# create data
set.seed(42)
n = 20
x = rnorm(n)
y = 2*x + 1 + rnorm(n)

# fit model
glm1 = glm(y ~ x)

# scale the x
x_scale = scale(x, scale = FALSE)

# predict on scaled values with type=terms
predicted <- predict(glm1, type = 'terms')
predicted
##             x
## 1   2.7935936
## 2  -1.7927182
## 3   0.4056582
## 4   1.0447618
## 5   0.5031345
## 6  -0.7061816
## 7   3.1266424
## 8  -0.6790155
## 9   4.3276867
## 10 -0.6033257
## 11  2.6370039
## 12  4.9632066
## 13 -3.7454749
## 14 -1.1152894
## 15 -0.7706213
## 16  1.0520780
## 17 -1.1282361
## 18 -6.7488922
## 19 -6.2371328
## 20  2.6731220
## attr(,"constant")
## [1] 1.112848

calculated <- x_scale * glm1$coefficients[2]
calculated
##             [,1]
##  [1,]  2.7935936
##  [2,] -1.7927182
##  [3,]  0.4056582
##  [4,]  1.0447618
##  [5,]  0.5031345
##  [6,] -0.7061816
##  [7,]  3.1266424
##  [8,] -0.6790155
##  [9,]  4.3276867
## [10,] -0.6033257
## [11,]  2.6370039
## [12,]  4.9632066
## [13,] -3.7454749
## [14,] -1.1152894
## [15,] -0.7706213
## [16,]  1.0520780
## [17,] -1.1282361
## [18,] -6.7488922
## [19,] -6.2371328
## [20,]  2.6731220
## attr(,"scaled:center")
## [1] 0.19192

all(as.data.frame(predicted) == calculated)
## [1] TRUE

# the constant:
# original intercept + sum of all the betas times the colmeans of the x's
calculated_constant <- glm1$coefficients[1] + glm1$coefficients[2] * mean(x)
calculated_constant
## (Intercept) 
##    1.112848

calculated_constant == attr(predicted, 'constant')
## (Intercept) 
##        TRUE

Cars example

my_cars <- MASS::Cars93[1:20, c('Price', 'Weight', 'RPM')]
fit <- glm(Price ~ Weight + RPM, data = my_cars)

# scale the weight
scaled <- scale(my_cars[-1])

predicted <- predict(fit, type = 'terms')
predicted
##         Weight        RPM
## 1  -10.1371543  9.2836501
## 2    1.9468707  3.3657855
## 3   -0.6678014  3.3657855
## 4   -0.2438005  3.3657855
## 5    3.0775397  4.8452516
## 6   -7.6638159  1.1465863
## 7    0.6748681 -1.8123460
## 8    9.6495533 -7.7302106
## 9    1.0282021 -1.8123460
## 10   2.7948725 -6.9904775
## 11   7.2468817  7.0644509
## 12 -13.1758273  1.1465863
## 13  -9.0064853  1.1465863
## 14  -2.5758053 -3.2918122
## 15  -3.2118067  1.1465863
## 16   4.1375419 -1.8123460
## 17   8.5188843 -7.7302106
## 18   6.8935476 -6.2507445
## 19  -0.5971346 -0.3328799
## 20   1.3108694  1.8863193
## attr(,"constant")
## [1] 23.59

calculated <- scaled[, 1] * fit$coefficients[2]
calculated
##            1            2            3            4            5 
## -0.022939195  0.004405541 -0.001511156 -0.000551692  0.006964113 
##            6            7            8            9           10 
## -0.017342319  0.001527148  0.021835811  0.002326701  0.006324470 
##           11           12           13           14           15 
##  0.016398846 -0.029815357 -0.020380623 -0.005828746 -0.007267943 
##           16           17           18           19           20 
##  0.009362774  0.019277239  0.015599292 -0.001351246  0.002966344

all(as.data.frame(predicted)$Weight == calculated)
## [1] FALSE

predicted[, 1] / calculated ## off by a constant
##        1        2        3        4        5        6        7        8 
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 
##        9       10       11       12       13       14       15       16 
## 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 441.9141 
##       17       18       19       20 
## 441.9141 441.9141 441.9141 441.9141

# for fit
predicted[, 2] / (scaled[, 2] * fit$coefficients[3])
##        1        2        3        4        5        6        7        8 
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 
##        9       10       11       12       13       14       15       16 
## 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 643.5714 
##       17       18       19       20 
## 643.5714 643.5714 643.5714 643.5714

The values are off by a constant, which is the scale value
attr(scaled, 'scaled:scale')
##   Weight      RPM 
## 441.9141 643.5714