Skip to content

Instantly share code, notes, and snippets.

@josealvarez97
Created October 3, 2018 11:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josealvarez97/2f0f805345fcc1ad79e6fa0316c2b701 to your computer and use it in GitHub Desktop.
JoseAlvarezCabrera-MakeUpWork-Session3.2-CS112
# Logistic regression on the lalonde data set ----
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity every computation below uses the training data
# itself (no held-out test set), so the accuracy figure is optimistic.

# Fit a logit model of treatment assignment on age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of treat = 1 — these are propensity scores.
# With no newdata argument, predict() scores the training observations.
glm.probs <- predict(lalonde.glm.fit, type = "response")
glm.probs[1:10]

# Convert probabilities into readable class labels.
# Use nrow(lalonde) instead of the hard-coded 445 so the code survives
# any change to the data set.
dim(lalonde)
glm.pred <- rep("NoTreat", nrow(lalonde))
glm.pred[glm.probs > .5] <- "Treat"
glm.pred

# Confusion matrix; diagonal entries are correct predictions.
# (Training-set accuracy — see note above. Was (253 + 9) / 445 = 0.588764.)
table(glm.pred, lalonde$treat)

# mean(glm.pred == lalonde$treat) would be wrong: treat is coded 0/1,
# not "NoTreat"/"Treat". Recode the truth to the same labels and compare.
mean(glm.pred == ifelse(lalonde$treat == 1, "Treat", "NoTreat"))
# This is the fraction of correct predictions (0.588764 on this data).
# It could be made honest by splitting into training and testing sets.
# R Lab from the ISLR textbook ----
# Install only when missing: the original install.packages(("ISLR"))
# had doubled parentheses and reinstalled the package on every run.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)

# Explore the Smarket (stock market) data set.
names(Smarket)
dim(Smarket)
summary(Smarket)
head(Smarket)

# cor(Smarket) errors because Direction (column 9) is qualitative,
# so drop that column before computing the correlation matrix.
head(Smarket[, -9])
cor(Smarket[, -9])

# Qualify the column explicitly rather than using attach(Smarket).
plot(Smarket$Volume)
# Logistic regression on Smarket ----
# Predict Direction from the five lag returns plus Volume.
# glm() fits generalized linear models; family = binomial requests
# logistic regression specifically.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)
summary(glm.fit)

# coef() accesses the fitted coefficients directly...
coef(glm.fit)
# ...and summary() exposes particular aspects of the fit.
summary(glm.fit)$coef
summary(glm.fit)$coef[, 4]  # the p-value column

# predict(type = "response") returns probabilities P(Y = 1 | X);
# with no newdata, predictions are for the training observations.
glm.probs <- predict(glm.fit, type = "response")
glm.probs[1:10]

# contrasts() shows the dummy coding R created — 1 means Up — so the
# probabilities above are probabilities of the market going up.
# BUG FIX: Direction is a column of Smarket, not a free-standing object;
# the bare name only worked after attach(Smarket), which this script
# deliberately avoids.
contrasts(Smarket$Direction)

# Convert predicted probabilities into class labels.
# nrow(Smarket) replaces the hard-coded 1250.
glm.pred <- rep("Down", nrow(Smarket))
glm.pred[glm.probs > .5] <- "Up"
glm.pred

# Confusion matrix: diagonal elements are correct predictions.
table(glm.pred, Smarket$Direction)
(507 + 145) / 1250

# Fraction of days predicted correctly (training-set accuracy,
# hence optimistic).
mean(glm.pred == Smarket$Direction)
# Held-out evaluation ----
# Scoring the model on its own training data is misleading, so hold
# out the year 2005 as a test set.
# BUG FIX: Year and Direction are columns of Smarket; the bare names in
# the original only worked after attach(Smarket).
train <- (Smarket$Year < 2005)
View(train)
Smarket.2005 <- Smarket[!train, ]  # held-out data (test set)
dim(Smarket.2005)
View(Smarket.2005)
Direction.2005 <- Smarket$Direction[!train]  # test-set labels only
View(Direction.2005)

# Refit using only the pre-2005 observations (the training set)...
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial, subset = train)
# ...then predict P(Up) for each day of 2005 (the test set).
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")

# nrow(Smarket.2005) replaces the hard-coded 252.
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # test-set accuracy
mean(glm.pred != Direction.2005)  # test-set error rate
# ~0.52 error rate: worse than random guessing on this test set.

# Keep only Lag1 and Lag2, which seemed to have the most
# predictive power, and re-evaluate.
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
               family = binomial, subset = train)
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # ~56% of daily movements correct
106 / (106 + 76)  # ~58% accuracy on days the model predicts an increase

# Predict for specific predictor values. One vectorized call covers
# both points (the original repeated it row by row as well).
predict(glm.fit,
        newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
        type = "response")
# Logistic regression on the lalonde data set (repeated) ----
# NOTE(review): this duplicates the section at the top of the file;
# kept so the transcript stays complete, but it could be removed.
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# For simplicity, everything is computed on the training data itself.
# Fit a logit model of treatment assignment on age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of treat = 1 (propensity scores).
glm.probs <- predict(lalonde.glm.fit, type = "response")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment