# predictive_models_template.R
#Prediction models in R
#Packages needed:
#"car" (for the Prestige data and recode())
#"Deducer"
#"tree"
#"randomForest"
#"reshape"
lapply(c("car","Deducer","randomForest","tree","reshape"),
library, character.only=TRUE)
############################################################################
#### Logistic Regression
############################################################################
#recoding the outcome variable: split income at 5930.5 (the sample median)
Prestige$income<-car::recode(Prestige$income,"0:5930.5='low income';else='high income'")
Prestige$income<-as.factor(Prestige$income)
#fitting a logistic model predicting the income group from education, women,
#prestige, and census; glm() models P(income == "low income"), since
#"high income" is the alphabetically first (reference) level
fit = glm(income ~ education + women + prestige + census,
          family = binomial, data = Prestige)
summary(fit) # display results
confint(fit) # 95% CI for the coefficients
exp(coef(fit)) # exponentiated coefficients
exp(confint(fit)) # 95% CI for exponentiated coefficients
residuals(fit, type="deviance") # residuals
plot(predict(fit, type="response")) # fitted probabilities by observation index
rocplot(fit,pred.prob.labels=TRUE,prob.label.digits=3,AUC=TRUE)
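#A hedged addition, not in the original template: turn the fitted
#probabilities into class labels with a 0.5 cutoff and tabulate them against
#the observed classes (probabilities above 0.5 map to "low income", the
#modelled level).
pred_class <- ifelse(predict(fit, type="response") > 0.5,
                     "low income", "high income")
table(observed = Prestige$income, predicted = pred_class)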
############################################################################
#### Bootstrapping
############################################################################
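#This section was left empty in the original template. Below is a minimal
#sketch (an assumption, not the author's code) that bootstraps the
#logistic-regression coefficients with the boot package; add "boot" to the
#package list above if you use it.
library(boot)
boot_coef <- function(data, indices) {
  d <- data[indices, ] #resample rows with replacement
  coef(glm(income ~ education + women + prestige + census,
           family = binomial, data = d))
}
set.seed(1234)
boot_fit <- boot(Prestige, boot_coef, R = 1000)
boot_fit
boot.ci(boot_fit, type = "perc", index = 2) #percentile CI for the education coefficient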
############################################################################
#### Classification Tree and Random Forest Predictive Models
############################################################################
#split the data by outcome class (allows optional down-sampling below)
data_positive<-subset(Prestige,income=="high income")
dim(data_positive)
data_negative<-subset(Prestige,income=="low income")
dim(data_negative)
#data_positive <- data_positive[sample(1:nrow(data_positive), 50,
# replace=FALSE),]
data_sample<-rbind(data_positive,data_negative)
dim(data_sample)
data_treatment<-with(data_sample,data.frame(education,women,prestige,census,income))
set.seed(1234)
#random ~50/50 split into training and test sets
id <- sample(1:2,nrow(data_treatment),replace=TRUE)
List <- split(data_treatment,id)
names(List) <- c("Train","Test")
train = List$Train
test = List$Test
dim(train)
dim(test)
head(train)
#Tree Analysis
#library(tree)
tree1 = tree(income ~., data= train)
summary(tree1)
tree.screens()
plot(tree1)
text(tree1)
tile.tree(tree1, train$income, axes = TRUE)
close.screen(all = TRUE)
names(tree1)
tree1$frame
#pruning: cross-validate to choose the tree size, then prune to 2 terminal nodes
set.seed(1234)
par(mfrow=c(1,2))
plot(cv.tree(tree1,FUN=prune.tree,method="misclass"))
plot(cv.tree(tree1))
pruneTree <- prune.tree(tree1,best=2)
tree.screens()
plot(pruneTree)
text(pruneTree)
tile.tree(pruneTree, train$income, axes = TRUE)
close.screen(all = TRUE)
summary(pruneTree)
pruneTree
#Hold-out validation: evaluate the unpruned tree on the test set
pred1 = predict(tree1, test, type = "class")
#named conf_tree rather than "t", which would mask the base transpose function
conf_tree = table(observed=test[,'income'], predicted=pred1)
conf_tree
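#A hedged addition, not in the original template: the same hold-out check for
#the pruned tree, plus the overall test-set misclassification rate.
pred_prune = predict(pruneTree, test, type = "class")
table(observed = test$income, predicted = pred_prune)
1 - sum(diag(conf_tree)) / sum(conf_tree) #misclassification rate, unpruned tree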
#random forest analysis
#library(randomForest)
#note: ntree=100000 is far larger than usually needed (a few hundred to a few
#thousand trees typically suffice) and is slow with proximity=TRUE
rf1 = randomForest(income ~ ., importance = TRUE, proximity = TRUE,
                   ntree = 100000, data = train, na.action = na.omit)
rf1
varImpPlot(rf1, main = "Variable Importance")
rf1$importance
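#A hedged addition, not in the original template: since proximity=TRUE was
#requested above, the proximity matrix can be visualised with randomForest's
#multidimensional-scaling plot.
MDSplot(rf1, train$income)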
#predicting using random forest
pred = predict(rf1, newdata=test)
pred
table(observed = test$income, predicted = pred)
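#A hedged addition, not in the original template: overall test-set accuracy
#for the random forest as a single summary number.
mean(pred == test$income)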