Skip to content

Instantly share code, notes, and snippets.

@DonatasD
Created November 26, 2015 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DonatasD/95ded26d746ed2700f06 to your computer and use it in GitHub Desktop.
Save DonatasD/95ded26d746ed2700f06 to your computer and use it in GitHub Desktop.
R
---
title: "temp"
author: "Donatas Daubaras"
date: "November 26, 2015"
output: pdf_document
---
```{r echo=FALSE, cache=TRUE, results="hide", message=FALSE}
library(ElemStatLearn)
library(rjags)
library(pls)
library(mice)
# Impute missing values with multiple imputation (mice) and return
# the first of the five completed datasets.
predictNAValues = function(data) {
  imputed <- mice(data, m = 5, printFlag = FALSE)
  complete(imputed, action = 1, inc = FALSE)
}
# Replace the ordinal Age and Lived category codes with the midpoint
# (in years) of the interval each code represents.
#
# Improvement over the original scalar sapply/switch loop: a single
# vectorized lookup per column.  Codes outside the known range now
# yield NA instead of switch() silently returning NULL and sapply()
# degrading the column to a list.
encode = function(data) {
  # Interval midpoints for Age codes 1..7 and Lived codes 1..5.
  age_midpoints <- c(15.5, 21, 29.5, 39.5, 49.5, 59.5, 69.5)
  lived_midpoints <- c(0.5, 2, 5, 8.5, 15)
  # as.character() first so factor input maps by label, not level index;
  # numeric and character codes pass through unchanged.
  data$Age <- age_midpoints[as.integer(as.character(data$Age))]
  data$Lived <- lived_midpoints[as.integer(as.character(data$Lived))]
  data
}
# Convert the categorical survey columns to factors, leaving all
# other columns untouched.
factorise = function(data) {
  categorical <- c("Ethnic", "Language", "Sex", "Marital",
                   "Occupation", "Dual_Income", "Status", "Home_Type")
  for (column in categorical) {
    data[[column]] <- factor(data[[column]])
  }
  data
}
# Preprocessing pipeline for the `marketing` survey data (loaded
# lazily by ElemStatLearn): impute NAs, recode ordinal interval codes
# to midpoints, then turn the categorical columns into factors.
marketing = predictNAValues(marketing)
marketing = encode(marketing)
marketing = factorise(marketing)
# Freq Analysis
```
```{r echo=FALSE, cache=TRUE}
library(glmnet)
library(leaps)
# Plot the four standard subset-selection diagnostics (RSS, adjusted
# R^2, Cp, BIC) against model size from a summary.regsubsets object,
# marking the optimum of each criterion with a red point.
#
# Fix: graphics parameters are restored via on.exit(), so par() is
# reset even if one of the plot calls fails part-way through (the
# original only restored them on normal completion).  Also corrects
# the "RS^2" axis-label typo.
plotSubset = function(regfit.summary) {
  op <- par(mfrow = c(2, 2))
  on.exit(par(op), add = TRUE)
  # RSS decreases monotonically with size, so no optimum is marked.
  plot(regfit.summary$rss, xlab = "Number of variables", ylab = "RSS", type = "l")
  # Adjusted R^2: mark its maximum.
  plot(regfit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R^2", type = "l")
  max <- which.max(regfit.summary$adjr2)
  points(max, regfit.summary$adjr2[max], col = "red", cex = 2, pch = 20)
  # Cp: mark its minimum.
  plot(regfit.summary$cp, xlab = "Number of variables", ylab = "CP", type = "l")
  min <- which.min(regfit.summary$cp)
  points(min, regfit.summary$cp[min], col = "red", cex = 2, pch = 20)
  # BIC: mark its minimum.
  plot(regfit.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
  min <- which.min(regfit.summary$bic)
  points(min, regfit.summary$bic[min], col = "red", cex = 2, pch = 20)
}
# Draw regsubsets' variable-membership plots on all four scales
# (R^2, adjusted R^2, Cp, BIC) in a 2x2 grid.
#
# Fix: par() state is restored with on.exit() so the device settings
# are reset even if one of the plot calls errors.
plotSubsetWithPredictors = function(reg) {
  op <- par(mfrow = c(2, 2))
  on.exit(par(op), add = TRUE)
  plot(reg, scale = "r2")
  plot(reg, scale = "adjr2")
  plot(reg, scale = "Cp")
  plot(reg, scale = "bic")
}
# S3 predict method for leaps::regsubsets fits (which ship without
# one).  Rebuilds the design matrix for `newdata` from the formula
# stored in the fitted object's call, then multiplies by the
# coefficients of the `id`-variable model.
#
# NOTE(review): object$call[[2]] assumes the formula was passed as
# the first argument to regsubsets() — true for every call in this
# document, but fragile in general.
predict.regsubsets=function(object,newdata,id,...){
form=as.formula(object$call[[2]])
mat=model.matrix(form,newdata)
coefi=coef(object,id=id)
xvars=names(coefi)
# Keep only the columns for the selected variables; matrix product
# gives the fitted values.
mat[,xvars]%*%coefi
}
```
## Best Subset
Variable selection using best subset
```{r cache=TRUE, echo=FALSE}
# Exhaustive best-subset search for Income on all other predictors,
# considering models with up to 30 terms (factors dummy-expand).
regfit.best=regsubsets(Income~., data=marketing, nvmax=30)
regfit.summary=summary(regfit.best)
```
```{r echo=FALSE}
# Visualize RSS / adjusted R^2 / Cp / BIC for the full-data search.
plotSubset(regfit.summary)
```
Use training and test data to check how well this model performs
```{r cache=TRUE, echo=FALSE}
# Estimate best-subset test error with a 50/50 train/test split.
# NOTE(review): no set.seed(), so the split differs between knits.
n <- nrow(marketing)
y <- marketing$Income
train <- sample(1:n, n / 2)
test <- (1:n)[-train]
mod <- regsubsets(Income ~ ., data = marketing[train, ], nvmax = 30)
# BUGFIX: summarize the training-set fit `mod`, not the full-data
# fit `regfit.best` — model size must be chosen from training data
# only, and `summary(mod)$which` below already assumes `mod`.
summ <- summary(mod)
values <- c(which.min(summ$cp), which.min(summ$bic), which.max(summ$adjr2))
print(paste("CP: ", values[1]))
print(paste("BIC: ", values[2]))
print(paste("ADJR2: ", values[3]))
# Use the smallest model size suggested by the three criteria.
minValue <- min(values)
pred <- predict(mod, marketing[test, ], minValue)
# Which variables are in the chosen model, then its test MSE.
summary(mod)$which[minValue, ]
mean((y[test] - pred)^2)
```
Make a linear model from the variables that were selected
```{r cache=TRUE, echo=FALSE}
# Refit an ordinary linear model using only the variables selected
# above.  Depends on `mod` and `minValue` from the previous chunk.
temp = model.matrix(Income ~ ., data=marketing)
# Keep the columns flagged TRUE for the chosen model size.
x = temp[,summary(mod)$which[minValue,]]
# Drop the model.matrix intercept column; lm() adds its own.
x = as.matrix(x[,-1])
y = marketing$Income
lm(y~x)
```
## Lasso
Variable selection using Lasso:
```{r cache=TRUE, echo=FALSE, message=FALSE}
library(glmnet)
# Lasso (alpha = 1) on the full data; lambda tuned by cross-validation.
y <- marketing[, 1]  # Income is the first column
grid <- 10^seq(10, -5, length = 100)
# BUGFIX: the original formula `marketing$Income ~ .` left Income
# inside the `.` expansion, so the response appeared among its own
# predictors.  `Income ~ .` excludes it, matching the later chunks.
# [,-1] drops the intercept column (glmnet fits its own).
xmat <- model.matrix(Income ~ ., data = marketing)[, -1]
mod <- glmnet(xmat, y, alpha = 1, lambda = grid)
cvout <- cv.glmnet(xmat, y, alpha = 1)
bestlam <- cvout$lambda.min
# Coefficients at the CV-optimal lambda; lasso zeroes some of them.
lasso.coef <- predict(mod, type = "coefficients", s = bestlam)
lasso.coef
```
Model performance
```{r cache=TRUE,echo=FALSE}
# Lasso test error: refit on the training rows only, tune lambda by
# CV on the training rows, then evaluate MSE on the held-out rows.
# Depends on `train`, `test`, `y`, and `grid` from earlier chunks.
xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
mod=glmnet(xmat,y[train],alpha=1,lambda=grid)
cvout=cv.glmnet(xmat,y[train],alpha=1)
bestlam=cvout$lambda.min
lasso.coef=predict(mod,type="coefficients",s=bestlam)
# Predict on the test design matrix at the CV-optimal lambda.
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
mean((y[test]-pred)^2)
```
## Ridge
Variable selection using Ridge
```{r cache=TRUE, echo=FALSE}
# Ridge regression (alpha = 0) on the full data; lambda tuned by CV.
# Unlike the lasso, ridge shrinks coefficients but keeps them all.
xmat=model.matrix(Income~.,data=marketing)[,-1]
mod=glmnet(xmat,y,alpha=0,lambda=grid)
cvout=cv.glmnet(xmat,y,alpha=0)
bestlam=cvout$lambda.min
ridge.coef=predict(mod,type="coefficients",s=bestlam)
ridge.coef
```
Model performance
```{r cache=TRUE,echo=FALSE}
# Ridge test error: refit on the training rows, tune lambda by CV,
# then evaluate MSE on the held-out rows.
xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
mod=glmnet(xmat,y[train],alpha=0,lambda=grid)
cvout=cv.glmnet(xmat,y[train],alpha=0)
bestlam=cvout$lambda.min
ridge.coef=predict(mod,type="coefficients",s=bestlam)
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
mean((y[test]-pred)^2)
```
## PCR
Number of Component selection
```{r cache=TRUE, echo=FALSE}
# Principal components regression with cross-validation to choose
# the number of components.
# NOTE(review): predictors are not standardized (scale = TRUE is not
# set) — confirm that is intended, since columns are on mixed scales.
mod=pcr(Income~., data=marketing, validation="CV")
summary(mod)
plot(mod)
# CV mean squared error of prediction versus number of components.
validationplot(mod,val.type="MSEP")
```
Test performance
```{r cache=TRUE, echo=FALSE}
# PCR test error using 9 components (chosen from the CV plot above).
modTrain <- pcr(Income ~ ., data = marketing[train, ], validation = "CV")
# BUGFIX: predict with the training-set fit `modTrain`; the original
# predicted with `mod`, the full-data fit, which had already seen the
# test rows and so understated the test error.
pred <- predict(modTrain, newdata = marketing[test, ], ncomp = 9)
mean((y[test] - pred)^2)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment