DonatasD/Project

## Project
---
title: "temp"
author: "Donatas Daubaras"
date: "November 26, 2015"
output: pdf_document
---

```{r echo=FALSE,cache=TRUE, results="hide", messgae}
library(ElemStatLearn)
library(rjags)
library(pls)

library(mice)

# Change Na values to predictions
predictNAValues = function(data) {
  data=complete(mice(data, m=5, printFlag = FALSE),action=1 , inc=FALSE)
  return(data)
}

# Alternative encodings
encode = function(data) {
  convertAge = function(x) {
    switch(x, "1" = 15.5, "2" = 21, "3" = 29.5, "4" = 39.5,
           "5" = 49.5, "6" = 59.5, "7" = 69.5)
  }
  convertLived = function(x) {
    switch(x, "1" = 0.5, "2" = 2, "3" = 5, "4" = 8.5,
           "5" = 15)
  }
  mappedAge=sapply(data$Age, FUN = convertAge)
  mappedLived=sapply(data$Lived, FUN = convertLived)
  data$Age=mappedAge
  data$Lived=mappedLived
  return(data)
}

# Factorise data
factorise = function(data) {
  data$Ethnic = factor(data$Ethnic)
  data$Language = factor(data$Language)
  data$Sex = factor(data$Sex)
  data$Marital = factor(data$Marital)
  data$Occupation = factor(data$Occupation)
  data$Dual_Income = factor(data$Dual_Income)
  data$Status = factor(data$Status)
  data$Home_Type = factor(data$Home_Type)
  return(data)
}

# Filter data
marketing = predictNAValues(marketing)
marketing = encode(marketing)
marketing = factorise(marketing)
# Freq Analysis
```

```{r echo=FALSE, cache=TRUE}
library(glmnet)
library(leaps)

plotSubset = function(regfit.summary) {
  #Plot RSS
  op = par(mfrow = c(2,2))
  plot(regfit.summary$rss, xlab = "Number of variables", ylab="RSS", type="l")

  #Plot adjusted RS^2 and mark its maximum
  plot(regfit.summary$adjr2, xlab = "Number of variables", ylab="Adjusted RS^2", type="l")
  max = which.max(regfit.summary$adjr2)
  points(max, regfit.summary$adjr2[max], col="red", cex=2, pch=20)

  #Plot CP and mark its minimum point
  plot(regfit.summary$cp, xlab= "Number of variables", ylab="CP", type="l")
  min = which.min(regfit.summary$cp)
  points(min, regfit.summary$cp[min], col="red", cex=2,pch=20)

  #Plot BIC and mark its minimum
  plot(regfit.summary$bic, xlab="Number of variables", ylab="BIC", type="l")
  min = which.min(regfit.summary$bic)
  points(min, regfit.summary$bic[min], col="red", cex=2,pch=20)

  par(op)
}

plotSubsetWithPredictors = function(reg) {
  op = par(mfrow=c(2,2))
  plot(reg, scale ="r2")
  plot(reg, scale ="adjr2")
  plot(reg, scale ="Cp")
  plot(reg, scale ="bic")
  par(op)
}

predict.regsubsets=function(object,newdata,id,...){
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  coefi=coef(object,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}
```
##BEST SUBSET
Variable selection using best subset
```{r cache=TRUE, echo=FALSE}
regfit.best=regsubsets(Income~., data=marketing, nvmax=30)
regfit.summary=summary(regfit.best)
```
```{r echo=FALSE}
plotSubset(regfit.summary)
```
Use training and test data to check how well this model performs
```{r cache=TRUE, echo=FALSE}
n=nrow(marketing)
y=marketing$Income
train=sample(1:n, n/2)
test=(1:n)[-train]
mod=regsubsets(Income~., data=marketing[train,], nvmax=30)
summ = summary(regfit.best)
values = c(which.min(summ$cp), which.min(summ$bic), which.max(summ$adjr2))
print(paste("CP: ",values[1]))
print(paste("BIC: ",values[2]))
print(paste("ADJR2: ",values[3]))
minValue = min(values)
pred=predict(mod, marketing[test,], minValue)
summary(mod)$which[minValue,]
mean((y[test]-pred)^2)
```
Make a linear model from the variables that were selected
```{r cache=TRUE, echo=FALSE}
temp = model.matrix(Income ~ ., data=marketing)
x = temp[,summary(mod)$which[minValue,]]
x = as.matrix(x[,-1])
y = marketing$Income
lm(y~x)
```
##LASSO

Variable selection using Lasso:
```{r cache=TRUE,echo=FALSE, message="hide"}
library(glmnet)
y=marketing[,1]
grid=10^seq(10,-5,length=100)
xmat=model.matrix(marketing$Income~.,data=marketing)[,-1]
mod=glmnet(xmat,y,alpha=1,lambda=grid)
cvout=cv.glmnet(xmat,y,alpha=1)
bestlam=cvout$lambda.min
lasso.coef=predict(mod,type="coefficients",s=bestlam)
lasso.coef
```
Model performance
```{r cache=TRUE,echo=FALSE}
xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
mod=glmnet(xmat,y[train],alpha=1,lambda=grid)
cvout=cv.glmnet(xmat,y[train],alpha=1)
bestlam=cvout$lambda.min
lasso.coef=predict(mod,type="coefficients",s=bestlam)
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
mean((y[test]-pred)^2)
```
## Ridge

Variable selection using Ridge
```{r cache=TRUE, echo=FALSE}
xmat=model.matrix(Income~.,data=marketing)[,-1]
mod=glmnet(xmat,y,alpha=0,lambda=grid)
cvout=cv.glmnet(xmat,y,alpha=0)
bestlam=cvout$lambda.min
ridge.coef=predict(mod,type="coefficients",s=bestlam)
ridge.coef
```
Model performance
```{r cache=TRUE,echo=FALSE}
xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
mod=glmnet(xmat,y[train],alpha=0,lambda=grid)
cvout=cv.glmnet(xmat,y[train],alpha=0)
bestlam=cvout$lambda.min
ridge.coef=predict(mod,type="coefficients",s=bestlam)
pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
mean((y[test]-pred)^2)
```

## PCR

Number of Component selection
```{r cache=TRUE, echo=FALSE}
mod=pcr(Income~., data=marketing, validation="CV")
summary(mod)
plot(mod)
validationplot(mod,val.type="MSEP")
```
Test performance
```{r cache=TRUE, echo=FALSE}
modTrain=pcr(Income~., data=marketing[train,], validation="CV")
pred=predict(mod,newdata=marketing[test,],ncomp=9)
mean((y[test]-pred)^2)
```
	---
	title: "temp"
	author: "Donatas Daubaras"
	date: "November 26, 2015"
	output: pdf_document
	---

	```{r echo=FALSE,cache=TRUE, results="hide", messgae}
	library(ElemStatLearn)
	library(rjags)
	library(pls)

	library(mice)

	# Change Na values to predictions
	predictNAValues = function(data) {
	data=complete(mice(data, m=5, printFlag = FALSE),action=1 , inc=FALSE)
	return(data)
	}

	# Alternative encodings
	encode = function(data) {
	convertAge = function(x) {
	switch(x, "1" = 15.5, "2" = 21, "3" = 29.5, "4" = 39.5,
	"5" = 49.5, "6" = 59.5, "7" = 69.5)
	}
	convertLived = function(x) {
	switch(x, "1" = 0.5, "2" = 2, "3" = 5, "4" = 8.5,
	"5" = 15)
	}
	mappedAge=sapply(data$Age, FUN = convertAge)
	mappedLived=sapply(data$Lived, FUN = convertLived)
	data$Age=mappedAge
	data$Lived=mappedLived
	return(data)
	}

	# Factorise data
	factorise = function(data) {
	data$Ethnic = factor(data$Ethnic)
	data$Language = factor(data$Language)
	data$Sex = factor(data$Sex)
	data$Marital = factor(data$Marital)
	data$Occupation = factor(data$Occupation)
	data$Dual_Income = factor(data$Dual_Income)
	data$Status = factor(data$Status)
	data$Home_Type = factor(data$Home_Type)
	return(data)
	}

	# Filter data
	marketing = predictNAValues(marketing)
	marketing = encode(marketing)
	marketing = factorise(marketing)
	# Freq Analysis
	```

	```{r echo=FALSE, cache=TRUE}
	library(glmnet)
	library(leaps)

	plotSubset = function(regfit.summary) {
	#Plot RSS
	op = par(mfrow = c(2,2))
	plot(regfit.summary$rss, xlab = "Number of variables", ylab="RSS", type="l")

	#Plot adjusted RS^2 and mark its maximum
	plot(regfit.summary$adjr2, xlab = "Number of variables", ylab="Adjusted RS^2", type="l")
	max = which.max(regfit.summary$adjr2)
	points(max, regfit.summary$adjr2[max], col="red", cex=2, pch=20)

	#Plot CP and mark its minimum point
	plot(regfit.summary$cp, xlab= "Number of variables", ylab="CP", type="l")
	min = which.min(regfit.summary$cp)
	points(min, regfit.summary$cp[min], col="red", cex=2,pch=20)

	#Plot BIC and mark its minimum
	plot(regfit.summary$bic, xlab="Number of variables", ylab="BIC", type="l")
	min = which.min(regfit.summary$bic)
	points(min, regfit.summary$bic[min], col="red", cex=2,pch=20)

	par(op)
	}

	plotSubsetWithPredictors = function(reg) {
	op = par(mfrow=c(2,2))
	plot(reg, scale ="r2")
	plot(reg, scale ="adjr2")
	plot(reg, scale ="Cp")
	plot(reg, scale ="bic")
	par(op)
	}

	predict.regsubsets=function(object,newdata,id,...){
	form=as.formula(object$call[[2]])
	mat=model.matrix(form,newdata)
	coefi=coef(object,id=id)
	xvars=names(coefi)
	mat[,xvars]%*%coefi
	}
	```
	##BEST SUBSET
	Variable selection using best subset
	```{r cache=TRUE, echo=FALSE}
	regfit.best=regsubsets(Income~., data=marketing, nvmax=30)
	regfit.summary=summary(regfit.best)
	```
	```{r echo=FALSE}
	plotSubset(regfit.summary)
	```
	Use training and test data to check how well this model performs
	```{r cache=TRUE, echo=FALSE}
	n=nrow(marketing)
	y=marketing$Income
	train=sample(1:n, n/2)
	test=(1:n)[-train]
	mod=regsubsets(Income~., data=marketing[train,], nvmax=30)
	summ = summary(regfit.best)
	values = c(which.min(summ$cp), which.min(summ$bic), which.max(summ$adjr2))
	print(paste("CP: ",values[1]))
	print(paste("BIC: ",values[2]))
	print(paste("ADJR2: ",values[3]))
	minValue = min(values)
	pred=predict(mod, marketing[test,], minValue)
	summary(mod)$which[minValue,]
	mean((y[test]-pred)^2)
	```
	Make a linear model from the variables that were selected
	```{r cache=TRUE, echo=FALSE}
	temp = model.matrix(Income ~ ., data=marketing)
	x = temp[,summary(mod)$which[minValue,]]
	x = as.matrix(x[,-1])
	y = marketing$Income
	lm(y~x)
	```
	##LASSO

	Variable selection using Lasso:
	```{r cache=TRUE,echo=FALSE, message="hide"}
	library(glmnet)
	y=marketing[,1]
	grid=10^seq(10,-5,length=100)
	xmat=model.matrix(marketing$Income~.,data=marketing)[,-1]
	mod=glmnet(xmat,y,alpha=1,lambda=grid)
	cvout=cv.glmnet(xmat,y,alpha=1)
	bestlam=cvout$lambda.min
	lasso.coef=predict(mod,type="coefficients",s=bestlam)
	lasso.coef
	```
	Model performance
	```{r cache=TRUE,echo=FALSE}
	xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
	mod=glmnet(xmat,y[train],alpha=1,lambda=grid)
	cvout=cv.glmnet(xmat,y[train],alpha=1)
	bestlam=cvout$lambda.min
	lasso.coef=predict(mod,type="coefficients",s=bestlam)
	pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
	mean((y[test]-pred)^2)
	```
	## Ridge

	Variable selection using Ridge
	```{r cache=TRUE, echo=FALSE}
	xmat=model.matrix(Income~.,data=marketing)[,-1]
	mod=glmnet(xmat,y,alpha=0,lambda=grid)
	cvout=cv.glmnet(xmat,y,alpha=0)
	bestlam=cvout$lambda.min
	ridge.coef=predict(mod,type="coefficients",s=bestlam)
	ridge.coef
	```
	Model performance
	```{r cache=TRUE,echo=FALSE}
	xmat=model.matrix(Income~.,data=marketing[train,])[,-1]
	mod=glmnet(xmat,y[train],alpha=0,lambda=grid)
	cvout=cv.glmnet(xmat,y[train],alpha=0)
	bestlam=cvout$lambda.min
	ridge.coef=predict(mod,type="coefficients",s=bestlam)
	pred=predict(mod,model.matrix(Income~.,data=marketing[test,])[,-1],s=bestlam)
	mean((y[test]-pred)^2)
	```

	## PCR

	Number of Component selection
	```{r cache=TRUE, echo=FALSE}
	mod=pcr(Income~., data=marketing, validation="CV")
	summary(mod)
	plot(mod)
	validationplot(mod,val.type="MSEP")
	```
	Test performance
	```{r cache=TRUE, echo=FALSE}
	modTrain=pcr(Income~., data=marketing[train,], validation="CV")
	pred=predict(mod,newdata=marketing[test,],ncomp=9)
	mean((y[test]-pred)^2)
	```