Skip to content

Instantly share code, notes, and snippets.

@rpietro
Created August 16, 2013 12:23
Show Gist options
  • Save rpietro/6249436 to your computer and use it in GitHub Desktop.
Save rpietro/6249436 to your computer and use it in GitHub Desktop.
regression scripts from Dalgaard's book
# regression models from dalgaard
library("ISwR")
?thuesen
thuesen
attach(thuesen)
lm(short.velocity~blood.glucose)
summary(lm(short.velocity~blood.glucose)) #the median should not be far from zero, and the minimum and maximum should be roughly equal in absolute value
plot(blood.glucose,short.velocity)
abline(lm(short.velocity~blood.glucose))
lm.velo <- lm(short.velocity~blood.glucose)
fitted(lm.velo)
resid(lm.velo)
plot(blood.glucose,short.velocity)
lines(blood.glucose,fitted(lm.velo))
lines(blood.glucose[!is.na(short.velocity)],fitted(lm.velo))
cc <- complete.cases(thuesen)
options(na.action=na.exclude)
lm.velo <- lm(short.velocity~blood.glucose)
fitted(lm.velo)
?segments
segments(blood.glucose, fitted(lm.velo), blood.glucose, short.velocity)
plot(fitted(lm.velo),resid(lm.velo))
qqnorm(resid(lm.velo))
predict(lm.velo)
?predict
predict(lm.velo, int="confidence")
#definition confidence interval: if confidence intervals are constructed across many separate data analyses of repeated (and possibly different) experiments, the proportion of such intervals that contain the true value of the parameter will match the confidence level; this is guaranteed by the reasoning underlying the construction of confidence intervals
pred.frame <- data.frame(blood.glucose=1:24)
head(pred.frame)
head(lm.velo)
pp <- predict(lm.velo, int="p", newdata=pred.frame)
pc <- predict(lm.velo, int="c", newdata=pred.frame)
plot(blood.glucose,short.velocity, ylim=range(short.velocity, pp, na.rm=T))
pred.gluc <- pred.frame$blood.glucose
?matlines
matlines(pred.gluc, pc, lty=c(2,3,3), col="red")
matlines(pred.gluc, pp, lty=c(1,3,3), col="blue")
# multiple regression
?par
par(mex=0.5) #‘mex’ is a character size expansion factor which is used to
# describe coordinates in the margins of plots. Note that this
# does not change the font size, rather specifies the size of
# font (as a multiple of ‘csi’) used to convert between ‘mar’
# and ‘mai’, and between ‘oma’ and ‘omi’.
pairs(cystfibr, gap=0, cex.labels=0.7) #gap: distance between subplots, in margin lines. cex.labels, font.labels: graphics parameters for the text panel.
attach(cystfibr)
lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
summary(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc))
anova(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)) # these tests are successive; they correspond to (reading upward from the bottom) a stepwise removal of terms from the model until finally only age is left.
m1<-lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
m2<-lm(pemax~age)
anova(m1,m2)
# > 955.4+155.0+632.3+2862.2+1549.1+561.9+194.6+92.4
# [1] 7002.9
# > 7002.9/8
# [1] 875.3625
# > 875.36/648.7
# [1] 1.349407
# > 1-pf(1.349407,8,15)
# [1] 0.2935148
# polynomial regression
attach(cystfibr)
summary(lm(pemax~height+I(height^2)))
pred.frame <- data.frame(height=seq(110,180,2))
lm.pemax.hq <- lm(pemax~height+I(height^2))
predict(lm.pemax.hq,interval="pred",newdata=pred.frame)
pp <- predict(lm.pemax.hq,newdata=pred.frame,interval="pred")
pc <- predict(lm.pemax.hq,newdata=pred.frame,interval="conf")
plot(height,pemax,ylim=c(0,200))
matlines(pred.frame$height,pp,lty=c(1,2,2),col="black")
matlines(pred.frame$height,pc,lty=c(1,3,3),col="black")
# regression through the origin
?runif
x <- runif(20)
x
?rnorm
y <- 2*x+rnorm(20,0,0.3)
y
summary(lm(y~x))
summary(lm(y~x-1))
anova(lm(y~x))
# design matrices
# If you add the columns together, weighted by the corresponding regression coefficients, you get the fitted values. the intercept enters as the coefficient to a column of ones.
model.matrix(pemax~height+weight)
attach(red.cell.folate)
model.matrix(folate~ventilation) #for a model with factors then you have dummy variables
# diagnostics
attach(thuesen)
options(na.action="na.exclude")
lm.velo <- lm(short.velocity~blood.glucose)
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
plot(lm.velo, which=1:4) # The third plot is of the square root of the absolute value of the standardized residuals; this reduces the skewness of the distribution and makes it much easier to detect if there might be a trend in the dispersion. The fourth plot is of “Cook’s distance”, which is a measure of the influence of each observation on the regression coefficients
par(opar)
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
plot(rstandard(lm.velo)) # standardized residuals since residuals corresponding to extreme x-values generally have a lower SD due to overfitting
plot(rstudent(lm.velo))
plot(dffits(lm.velo),type="l") #how much an observation affects the as- sociated fitted value
matplot(dfbetas(lm.velo),type="l", col="red") #change in the estimated parameters if an observation is excluded relative to its standard error.
lines(sqrt(cooks.distance(lm.velo)), lwd=2)
par(opar)
summary(lm(short.velocity~blood.glucose, subset=-13))
# logistic regression
no.yes <- c("No","Yes")
?gl
smoking <- gl(2,1,8,no.yes)
obesity <- gl(2,2,8,no.yes)
snoring <- gl(2,4,8,no.yes)
n.tot <- c(60,17,8,2,187,85,51,23)
n.hyp <- c(5,2,1,0,35,13,15,8)
data.frame(smoking,obesity,snoring,n.tot,n.hyp)
expand.grid(smoking=no.yes, obesity=no.yes, snoring=no.yes)
hyp.tbl <- cbind(n.hyp,n.tot-n.hyp)
hyp.tbl
glm(hyp.tbl ~ smoking + obesity + snoring, family=binomial("logit"))
glm(hyp.tbl ~ smoking + obesity + snoring, binomial)
prop.hyp <- n.hyp/n.tot
glm.hyp <- glm(prop.hyp~smoking+obesity+snoring,binomial,weights=n.tot) # The other way to specify a logistic regression model is to give the proportion of diseased in each cell. It is necessary to give weights because R cannot see how many observations a proportion is based on.
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
summary(glm.hyp)
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
summary(glm.hyp)
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
anova(glm.hyp, test="Chisq")
glm.hyp <- glm(hyp.tbl~snoring+obesity+smoking,binomial)
anova(glm.hyp, test="Chisq")
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
anova(glm.hyp, test="Chisq")
drop1(glm.hyp, test="Chisq")
exp(cbind(OR=coef(glm.hyp), confint(glm.hyp)))
library("ISwR")
juul$menarche <- factor(juul$menarche, labels=c("No","Yes"))
juul$tanner <- factor(juul$tanner)
juul.girl <- subset(juul,age>8 & age<20 & complete.cases(menarche))
attach(juul.girl)
summary(glm(menarche ~ age,binomial))
summary(glm(menarche ~ age+tanner,binomial))
drop1(glm(menarche ~ age+tanner,binomial),test="Chisq")
predict(glm.hyp)
predict(glm.hyp, type="response") #obtain probabilities
plot(age, fitted(glm(menarche~age,binomial)))
glm.menarche <- glm(menarche~age, binomial)
Age <- seq(8,20,.1)
newages <- data.frame(age=Age)
newages
predicted.probability <- predict(glm.menarche, newages, type="resp")
plot(predicted.probability ~ Age, type="l")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment