Skip to content

Instantly share code, notes, and snippets.

@josealvarez97
Created October 3, 2018 11:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josealvarez97/2f0f805345fcc1ad79e6fa0316c2b701 to your computer and use it in GitHub Desktop.
JoseAlvarezCabrera-MakeUpWork-Session3.2-CS112
# Logistic regression on the lalonde data set ----
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity every computation below uses the training data
# itself (no held-out test set), so the accuracy figure is optimistic.

# Fit a logit model of treatment assignment on age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of treat = 1 — these are propensity scores.
# With no newdata argument, predict() scores the training observations.
glm.probs <- predict(lalonde.glm.fit, type = "response")
glm.probs[1:10]

# Convert probabilities into readable class labels.
# Use nrow(lalonde) instead of the hard-coded 445 so the code survives
# any change to the data set.
dim(lalonde)
glm.pred <- rep("NoTreat", nrow(lalonde))
glm.pred[glm.probs > .5] <- "Treat"
glm.pred

# Confusion matrix; diagonal entries are correct predictions.
# (Training-set accuracy — see note above. Was (253 + 9) / 445 = 0.588764.)
table(glm.pred, lalonde$treat)

# mean(glm.pred == lalonde$treat) would be wrong: treat is coded 0/1,
# not "NoTreat"/"Treat". Recode the truth to the same labels and compare.
mean(glm.pred == ifelse(lalonde$treat == 1, "Treat", "NoTreat"))
# This is the fraction of correct predictions (0.588764 on this data).
# It could be made honest by splitting into training and testing sets.
# R Lab from the ISLR textbook ----
# Install only when missing: the original install.packages(("ISLR"))
# had doubled parentheses and reinstalled the package on every run.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)

# Explore the Smarket (stock market) data set.
names(Smarket)
dim(Smarket)
summary(Smarket)
head(Smarket)

# cor(Smarket) errors because Direction (column 9) is qualitative,
# so drop that column before computing the correlation matrix.
head(Smarket[, -9])
cor(Smarket[, -9])

# Qualify the column explicitly rather than using attach(Smarket).
plot(Smarket$Volume)
# Logistic regression on Smarket ----
# Predict Direction from the five lag returns plus Volume.
# glm() fits generalized linear models; family = binomial requests
# logistic regression specifically.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)
summary(glm.fit)

# coef() accesses the fitted coefficients directly...
coef(glm.fit)
# ...and summary() exposes particular aspects of the fit.
summary(glm.fit)$coef
summary(glm.fit)$coef[, 4]  # the p-value column

# predict(type = "response") returns probabilities P(Y = 1 | X);
# with no newdata, predictions are for the training observations.
glm.probs <- predict(glm.fit, type = "response")
glm.probs[1:10]

# contrasts() shows the dummy coding R created — 1 means Up — so the
# probabilities above are probabilities of the market going up.
# BUG FIX: Direction is a column of Smarket, not a free-standing object;
# the bare name only worked after attach(Smarket), which this script
# deliberately avoids.
contrasts(Smarket$Direction)

# Convert predicted probabilities into class labels.
# nrow(Smarket) replaces the hard-coded 1250.
glm.pred <- rep("Down", nrow(Smarket))
glm.pred[glm.probs > .5] <- "Up"
glm.pred

# Confusion matrix: diagonal elements are correct predictions.
table(glm.pred, Smarket$Direction)
(507 + 145) / 1250

# Fraction of days predicted correctly (training-set accuracy,
# hence optimistic).
mean(glm.pred == Smarket$Direction)
# Held-out evaluation ----
# Scoring the model on its own training data is misleading, so hold
# out the year 2005 as a test set.
# BUG FIX: Year and Direction are columns of Smarket; the bare names in
# the original only worked after attach(Smarket).
train <- (Smarket$Year < 2005)
View(train)
Smarket.2005 <- Smarket[!train, ]  # held-out data (test set)
dim(Smarket.2005)
View(Smarket.2005)
Direction.2005 <- Smarket$Direction[!train]  # test-set labels only
View(Direction.2005)

# Refit using only the pre-2005 observations (the training set)...
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial, subset = train)
# ...then predict P(Up) for each day of 2005 (the test set).
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")

# nrow(Smarket.2005) replaces the hard-coded 252.
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # test-set accuracy
mean(glm.pred != Direction.2005)  # test-set error rate
# ~0.52 error rate: worse than random guessing on this test set.

# Keep only Lag1 and Lag2, which seemed to have the most
# predictive power, and re-evaluate.
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
               family = binomial, subset = train)
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # ~56% of daily movements correct
106 / (106 + 76)  # ~58% accuracy on days the model predicts an increase

# Predict for specific predictor values. One vectorized call covers
# both points (the original repeated it row by row as well).
predict(glm.fit,
        newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
        type = "response")
# Logistic regression on the lalonde data set (repeated) ----
# NOTE(review): this duplicates the section at the top of the file;
# kept so the transcript stays complete, but it could be removed.
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# For simplicity, everything is computed on the training data itself.
# Fit a logit model of treatment assignment on age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of treat = 1 (propensity scores).
glm.probs <- predict(lalonde.glm.fit, type = "response")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment