Created
November 26, 2015 14:45
-
-
Save DonatasD/95ded26d746ed2700f06 to your computer and use it in GitHub Desktop.
R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "temp" | |
author: "Donatas Daubaras" | |
date: "November 26, 2015" | |
output: pdf_document | |
--- | |
```{r echo=FALSE,cache=TRUE, results="hide", messgae} | |
library(ElemStatLearn) | |
library(rjags) | |
library(pls) | |
library(mice) | |
# Change Na values to predictions | |
predictNAValues = function(data) { | |
data=complete(mice(data, m=5, printFlag = FALSE),action=1 , inc=FALSE) | |
return(data) | |
} | |
# Alternative encodings | |
encode = function(data) { | |
convertAge = function(x) { | |
switch(x, "1" = 15.5, "2" = 21, "3" = 29.5, "4" = 39.5, | |
"5" = 49.5, "6" = 59.5, "7" = 69.5) | |
} | |
convertLived = function(x) { | |
switch(x, "1" = 0.5, "2" = 2, "3" = 5, "4" = 8.5, | |
"5" = 15) | |
} | |
mappedAge=sapply(data$Age, FUN = convertAge) | |
mappedLived=sapply(data$Lived, FUN = convertLived) | |
data$Age=mappedAge | |
data$Lived=mappedLived | |
return(data) | |
} | |
# Factorise data | |
factorise = function(data) { | |
data$Ethnic = factor(data$Ethnic) | |
data$Language = factor(data$Language) | |
data$Sex = factor(data$Sex) | |
data$Marital = factor(data$Marital) | |
data$Occupation = factor(data$Occupation) | |
data$Dual_Income = factor(data$Dual_Income) | |
data$Status = factor(data$Status) | |
data$Home_Type = factor(data$Home_Type) | |
return(data) | |
} | |
# Filter data | |
marketing = predictNAValues(marketing) | |
marketing = encode(marketing) | |
marketing = factorise(marketing) | |
# Freq Analysis | |
``` | |
```{r echo=FALSE, cache=TRUE} | |
library(glmnet) | |
library(leaps) | |
plotSubset = function(regfit.summary) { | |
#Plot RSS | |
op = par(mfrow = c(2,2)) | |
plot(regfit.summary$rss, xlab = "Number of variables", ylab="RSS", type="l") | |
#Plot adjusted RS^2 and mark its maximum | |
plot(regfit.summary$adjr2, xlab = "Number of variables", ylab="Adjusted RS^2", type="l") | |
max = which.max(regfit.summary$adjr2) | |
points(max, regfit.summary$adjr2[max], col="red", cex=2, pch=20) | |
#Plot CP and mark its minimum point | |
plot(regfit.summary$cp, xlab= "Number of variables", ylab="CP", type="l") | |
min = which.min(regfit.summary$cp) | |
points(min, regfit.summary$cp[min], col="red", cex=2,pch=20) | |
#Plot BIC and mark its minimum | |
plot(regfit.summary$bic, xlab="Number of variables", ylab="BIC", type="l") | |
min = which.min(regfit.summary$bic) | |
points(min, regfit.summary$bic[min], col="red", cex=2,pch=20) | |
par(op) | |
} | |
plotSubsetWithPredictors = function(reg) { | |
op = par(mfrow=c(2,2)) | |
plot(reg, scale ="r2") | |
plot(reg, scale ="adjr2") | |
plot(reg, scale ="Cp") | |
plot(reg, scale ="bic") | |
par(op) | |
} | |
predict.regsubsets=function(object,newdata,id,...){ | |
form=as.formula(object$call[[2]]) | |
mat=model.matrix(form,newdata) | |
coefi=coef(object,id=id) | |
xvars=names(coefi) | |
mat[,xvars]%*%coefi | |
} | |
``` | |
##BEST SUBSET | |
Variable selection using best subset | |
```{r cache=TRUE, echo=FALSE} | |
regfit.best=regsubsets(Income~., data=marketing, nvmax=30) | |
regfit.summary=summary(regfit.best) | |
``` | |
```{r echo=FALSE} | |
plotSubset(regfit.summary) | |
``` | |
Use training and test data to check how well this model performs | |
```{r cache=TRUE, echo=FALSE} | |
n=nrow(marketing) | |
y=marketing$Income | |
train=sample(1:n, n/2) | |
test=(1:n)[-train] | |
mod=regsubsets(Income~., data=marketing[train,], nvmax=30) | |
summ = summary(regfit.best) | |
values = c(which.min(summ$cp), which.min(summ$bic), which.max(summ$adjr2)) | |
print(paste("CP: ",values[1])) | |
print(paste("BIC: ",values[2])) | |
print(paste("ADJR2: ",values[3])) | |
minValue = min(values) | |
pred=predict(mod, marketing[test,], minValue) | |
summary(mod)$which[minValue,] | |
mean((y[test]-pred)^2) | |
``` | |
Make a linear model from the variables that were selected | |
```{r cache=TRUE, echo=FALSE} | |
temp = model.matrix(Income ~ ., data=marketing) | |
x = temp[,summary(mod)$which[minValue,]] | |
x = as.matrix(x[,-1]) | |
y = marketing$Income | |
lm(y~x) | |
``` | |
##LASSO | |
Variable selection using Lasso: | |
```{r cache=TRUE,echo=FALSE, message="hide"} | |
library(glmnet) | |
y=marketing[,1] | |
grid=10^seq(10,-5,length=100) | |
xmat=model.matrix(marketing$Income~.,data=marketing)[,-1] | |
mod=glmnet(xmat,y,alpha=1,lambda=grid) | |
cvout=cv.glmnet(xmat,y,alpha=1) | |
bestlam=cvout$lambda.min | |
lasso.coef=predict(mod,type="coefficients",s=bestlam) | |
lasso.coef | |
``` | |
Model performance | |
```{r cache=TRUE,echo=FALSE} | |
xmat=model.matrix(Income~.,data=marketing[train,])[,-1] | |
mod=glmnet(xmat,y[train],alpha=1,lambda=grid) | |
cvout=cv.glmnet(xmat,y[train],alpha=1) | |
bestlam=cvout$lambda.min | |
lasso.coef=predict(mod,type="coefficients",s=bestlam) | |
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam) | |
mean((y[test]-pred)^2) | |
``` | |
## Ridge | |
Variable selection using Ridge | |
```{r cache=TRUE, echo=FALSE} | |
xmat=model.matrix(Income~.,data=marketing)[,-1] | |
mod=glmnet(xmat,y,alpha=0,lambda=grid) | |
cvout=cv.glmnet(xmat,y,alpha=0) | |
bestlam=cvout$lambda.min | |
ridge.coef=predict(mod,type="coefficients",s=bestlam) | |
ridge.coef | |
``` | |
Model performance | |
```{r cache=TRUE,echo=FALSE} | |
xmat=model.matrix(Income~.,data=marketing[train,])[,-1] | |
mod=glmnet(xmat,y[train],alpha=0,lambda=grid) | |
cvout=cv.glmnet(xmat,y[train],alpha=0) | |
bestlam=cvout$lambda.min | |
ridge.coef=predict(mod,type="coefficients",s=bestlam) | |
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam) | |
mean((y[test]-pred)^2) | |
``` | |
## PCR | |
Number of Component selection | |
```{r cache=TRUE, echo=FALSE} | |
mod=pcr(Income~., data=marketing, validation="CV") | |
summary(mod) | |
plot(mod) | |
validationplot(mod,val.type="MSEP") | |
``` | |
Test performance | |
```{r cache=TRUE, echo=FALSE} | |
modTrain=pcr(Income~., data=marketing[train,], validation="CV") | |
pred=predict(mod,newdata=marketing[test,],ncomp=9) | |
mean((y[test]-pred)^2) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment