stephenturner/RF vs lm and FS 1.r

## RF vs lm and FS 1.r
> #First define the function
> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
>
> # first, fit a stepwise model and test it on new data
> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
[1] 0.6991431
>
> # Fit and test the RF model
> set.seed(14)
> totfat.rf.split12 <- randomForest(
+ x=totfat1[ ,-1], #exclude the first column, the outcome
+ y=totfat1[ , 1], #include only the first column, the outcome
+ xtest=totfat2[ ,-1],
+ ytest=totfat2[ , 1],
+ keep.forest=TRUE,
+ importance=TRUE,
+ )
> print(totfat.rf.split12)

Call:
 randomForest(x = totfat1[, -1], y = totfat1[, 1], xtest = totfat2[,      -1], ytest = totfat2[, 1], importance = TRUE, keep.forest = TRUE)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 17

          Mean of squared residuals: 24052766
                    % Var explained: 69.34
                       Test set MSE: 18126592
                    % Var explained: 80.51
> rsq(predict(totfat.rf.split12, totfat2), totfat2[ ,1])
[1] 0.799995
>
> # now fit a linear model for total fat with the important variables
> totfat1.lm <- lm(CRC_FAT_TOT ~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, data=totfat1)
>
> #how well does that model explain the data?
> summary(totfat1.lm)

Call:
lm(formula = CRC_FAT_TOT ~ CRC_TECH_BMI + ALSR_Leptin + lep_adipo,
    data = totfat1)

Residuals:
    Min      1Q  Median      3Q     Max
-5565.7 -1381.8   249.6  1535.7  4905.7

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)   -7532.9     3526.6  -2.136   0.0395
CRC_TECH_BMI   1100.5      165.3   6.658 9.27e-08
ALSR_Leptin     255.0       54.7   4.662 4.20e-05
lep_adipo      -128.6      106.3  -1.210   0.2342

Residual standard error: 2623 on 36 degrees of freedom
Multiple R-squared: 0.9211,     Adjusted R-squared: 0.9145
F-statistic: 140.1 on 3 and 36 DF,  p-value: < 2.2e-16

>
> #how much variation in new data does the model explain?
> rsq(predict(totfat1.lm, totfat2), totfat2$CRC_FAT_TOT)
[1] 0.9011825
>
> # what if i were to restrict the random forest procedure to only allow
> # the variables that i found were important modeling the whole dataset?
> set.seed(42)
> totfat1.rf.restricted <- randomForest(
+ CRC_FAT_TOT~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo,
+ data=totfat1, keep.forest=TRUE, importance=TRUE)
> rsq(predict(totfat1.rf.restricted,totfat2),totfat2$CRC_FAT_TOT)
[1] 0.8820904
	> #First define the function
	> rsq <- function (yhat, y) 1 - sum((y - yhat)^2)/sum((y - mean(y))^2)
	>
	> # first, fit a stepwise model and test it on new data
	> totfat1.intonly <- lm(CRC_FAT_TOT~1, data=totfat1)
	> totfat1.full <- lm(CRC_FAT_TOT~., data=totfat1)
	> totfat1.step <- step(totfat1.intonly, scope=list(upper=totfat1.full), trace=0)
	> rsq(predict(totfat1.step,totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.6991431
	>
	> # Fit and test the RF model
	> set.seed(14)
	> totfat.rf.split12 <- randomForest(
	+ x=totfat1[ ,-1], #exclude the first column, the outcome
	+ y=totfat1[ , 1], #include only the first column, the outcome
	+ xtest=totfat2[ ,-1],
	+ ytest=totfat2[ , 1],
	+ keep.forest=TRUE,
	+ importance=TRUE,
	+ )
	> print(totfat.rf.split12)

	Call:
	randomForest(x = totfat1[, -1], y = totfat1[, 1], xtest = totfat2[, -1], ytest = totfat2[, 1], importance = TRUE, keep.forest = TRUE)
	Type of random forest: regression
	Number of trees: 500
	No. of variables tried at each split: 17

	Mean of squared residuals: 24052766
	% Var explained: 69.34
	Test set MSE: 18126592
	% Var explained: 80.51
	> rsq(predict(totfat.rf.split12, totfat2), totfat2[ ,1])
	[1] 0.799995
	>
	> # now fit a linear model for total fat with the important variables
	> totfat1.lm <- lm(CRC_FAT_TOT ~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo, data=totfat1)
	>
	> #how well does that model explain the data?
	> summary(totfat1.lm)

	Call:
	lm(formula = CRC_FAT_TOT ~ CRC_TECH_BMI + ALSR_Leptin + lep_adipo,
	data = totfat1)

	Residuals:
	Min 1Q Median 3Q Max
	-5565.7 -1381.8 249.6 1535.7 4905.7

	Coefficients:
	Estimate Std. Error t value Pr(>\|t\|)
	(Intercept) -7532.9 3526.6 -2.136 0.0395
	CRC_TECH_BMI 1100.5 165.3 6.658 9.27e-08
	ALSR_Leptin 255.0 54.7 4.662 4.20e-05
	lep_adipo -128.6 106.3 -1.210 0.2342

	Residual standard error: 2623 on 36 degrees of freedom
	Multiple R-squared: 0.9211, Adjusted R-squared: 0.9145
	F-statistic: 140.1 on 3 and 36 DF, p-value: < 2.2e-16

	>
	> #how much variation in new data does the model explain?
	> rsq(predict(totfat1.lm, totfat2), totfat2$CRC_FAT_TOT)
	[1] 0.9011825
	>
	> # what if i were to restrict the random forest procedure to only allow
	> # the variables that i found were important modeling the whole dataset?
	> set.seed(42)
	> totfat1.rf.restricted <- randomForest(
	+ CRC_FAT_TOT~ CRC_TECH_BMI+ALSR_Leptin+lep_adipo,
	+ data=totfat1, keep.forest=TRUE, importance=TRUE)
	> rsq(predict(totfat1.rf.restricted,totfat2),totfat2$CRC_FAT_TOT)
	[1] 0.8820904