rpietro/regression_dalgaard.r

## regression_dalgaard.r
# regression models from dalgaard

library("ISwR")
?thuesen
thuesen
attach(thuesen)
lm(short.velocity~blood.glucose)
summary(lm(short.velocity~blood.glucose)) #the median should not be far from zero, and the minimum and maximum should be roughly equal in absolute value
plot(blood.glucose,short.velocity)
abline(lm(short.velocity~blood.glucose))

lm.velo <- lm(short.velocity~blood.glucose)
fitted(lm.velo)
resid(lm.velo)
plot(blood.glucose,short.velocity)
lines(blood.glucose,fitted(lm.velo))
lines(blood.glucose[!is.na(short.velocity)],fitted(lm.velo))
cc <- complete.cases(thuesen)
options(na.action=na.exclude)


lm.velo <- lm(short.velocity~blood.glucose)
fitted(lm.velo)
?segments
segments(blood.glucose, fitted(lm.velo), blood.glucose, short.velocity)
plot(fitted(lm.velo),resid(lm.velo))
qqnorm(resid(lm.velo))
predict(lm.velo)
?predict
predict(lm.velo, int="confidence")

#definition confidence interval: if confidence intervals are constructed across many separate data analyses of repeated (and possibly different) experiments, the proportion of such intervals that contain the true value of the parameter will match the confidence level; this is guaranteed by the reasoning underlying the construction of confidence intervals

pred.frame <- data.frame(blood.glucose=1:24)
head(pred.frame)
head(lm.velo)
pp <- predict(lm.velo, int="p", newdata=pred.frame)
pc <- predict(lm.velo, int="c", newdata=pred.frame)
plot(blood.glucose,short.velocity, ylim=range(short.velocity, pp, na.rm=T))
pred.gluc <- pred.frame$blood.glucose
?matlines
matlines(pred.gluc, pc, lty=c(2,3,3), col="red")
matlines(pred.gluc, pp, lty=c(1,3,3), col="blue")


# multiple regression

?par
par(mex=0.5) #‘mex’ is a character size expansion factor which is used to
          # describe coordinates in the margins of plots. Note that this
          # does not change the font size, rather specifies the size of
          # font (as a multiple of ‘csi’) used to convert between ‘mar’
          # and ‘mai’, and between ‘oma’ and ‘omi’.
pairs(cystfibr, gap=0, cex.labels=0.7) #gap: distance between subplots, in margin lines. cex.labels, font.labels: graphics parameters for the text panel.

attach(cystfibr)
lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
summary(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc))
anova(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)) # these tests are successive; they correspond to (reading upward from the bottom) a stepwise removal of terms from the model until finally only age is left.
m1<-lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
m2<-lm(pemax~age)
anova(m1,m2)

# > 955.4+155.0+632.3+2862.2+1549.1+561.9+194.6+92.4
# [1] 7002.9
# > 7002.9/8
# [1] 875.3625
# > 875.36/648.7
# [1] 1.349407
# > 1-pf(1.349407,8,15)
# [1] 0.2935148


# polynomial regression
attach(cystfibr)
summary(lm(pemax~height+I(height^2)))
pred.frame <- data.frame(height=seq(110,180,2))
lm.pemax.hq <- lm(pemax~height+I(height^2))
predict(lm.pemax.hq,interval="pred",newdata=pred.frame)
pp <- predict(lm.pemax.hq,newdata=pred.frame,interval="pred")
pc <- predict(lm.pemax.hq,newdata=pred.frame,interval="conf")
plot(height,pemax,ylim=c(0,200))
matlines(pred.frame$height,pp,lty=c(1,2,2),col="black")
matlines(pred.frame$height,pc,lty=c(1,3,3),col="black")


# regression through the origin

?runif
x <- runif(20)
x
?rnorm
y <- 2*x+rnorm(20,0,0.3)
y
summary(lm(y~x))
summary(lm(y~x-1))
anova(lm(y~x))


# design matrices
# If you add the columns together, weighted by the corresponding regression coefficients, you get  the fitted values. the intercept enters as the coefficient to a column of ones.
model.matrix(pemax~height+weight)

attach(red.cell.folate)
model.matrix(folate~ventilation) #for a model with factors then you have dummy variables


# diagnostics
attach(thuesen)
options(na.action="na.exclude")
lm.velo <- lm(short.velocity~blood.glucose)
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
plot(lm.velo, which=1:4) # The third plot is of the square root of the absolute value of the standardized residuals; this reduces the skewness of the distribution and makes it much easier to detect if there might be a trend in the dispersion. The fourth plot is of “Cook’s distance”, which is a measure of the influence of each observation on the regression coefficients
par(opar)
opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
plot(rstandard(lm.velo)) # standardized residuals since residuals corresponding to extreme x-values generally have a lower SD due to overfitting
plot(rstudent(lm.velo))
plot(dffits(lm.velo),type="l") #how much an observation affects the as- sociated fitted value
matplot(dfbetas(lm.velo),type="l", col="red") #change in the estimated parameters if an observation is excluded relative to its standard error.
lines(sqrt(cooks.distance(lm.velo)), lwd=2)
par(opar)
summary(lm(short.velocity~blood.glucose, subset=-13))


# logistic regression
no.yes <- c("No","Yes")
?gl
smoking <- gl(2,1,8,no.yes)
obesity <- gl(2,2,8,no.yes)
snoring <- gl(2,4,8,no.yes)
n.tot <- c(60,17,8,2,187,85,51,23)
n.hyp <- c(5,2,1,0,35,13,15,8)
data.frame(smoking,obesity,snoring,n.tot,n.hyp)
expand.grid(smoking=no.yes, obesity=no.yes, snoring=no.yes)
hyp.tbl <- cbind(n.hyp,n.tot-n.hyp)
hyp.tbl


glm(hyp.tbl ~ smoking + obesity + snoring, family=binomial("logit"))
glm(hyp.tbl ~ smoking + obesity + snoring, binomial)
prop.hyp <- n.hyp/n.tot
glm.hyp <- glm(prop.hyp~smoking+obesity+snoring,binomial,weights=n.tot) # The other way to specify a logistic regression model is to give the proportion of diseased in each cell. It is necessary to give weights because R cannot see how many observations a proportion is based on.
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
summary(glm.hyp)
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
summary(glm.hyp)
glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
anova(glm.hyp, test="Chisq")
glm.hyp <- glm(hyp.tbl~snoring+obesity+smoking,binomial)
anova(glm.hyp, test="Chisq")
glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
anova(glm.hyp, test="Chisq")
drop1(glm.hyp, test="Chisq")
exp(cbind(OR=coef(glm.hyp), confint(glm.hyp)))


library("ISwR")
juul$menarche <- factor(juul$menarche, labels=c("No","Yes"))
juul$tanner <- factor(juul$tanner)
juul.girl <- subset(juul,age>8 & age<20 & complete.cases(menarche))
attach(juul.girl)
summary(glm(menarche ~ age,binomial))
summary(glm(menarche ~ age+tanner,binomial))
drop1(glm(menarche ~ age+tanner,binomial),test="Chisq")


predict(glm.hyp)
predict(glm.hyp, type="response") #obtain probabilities
plot(age, fitted(glm(menarche~age,binomial)))
glm.menarche <- glm(menarche~age, binomial)
Age <- seq(8,20,.1)
newages <- data.frame(age=Age)
newages
predicted.probability <- predict(glm.menarche, newages, type="resp")
plot(predicted.probability ~ Age, type="l")
	# regression models from dalgaard

	library("ISwR")
	?thuesen
	thuesen
	attach(thuesen)
	lm(short.velocity~blood.glucose)
	summary(lm(short.velocity~blood.glucose)) #the median should not be far from zero, and the minimum and maximum should be roughly equal in absolute value
	plot(blood.glucose,short.velocity)
	abline(lm(short.velocity~blood.glucose))

	lm.velo <- lm(short.velocity~blood.glucose)
	fitted(lm.velo)
	resid(lm.velo)
	plot(blood.glucose,short.velocity)
	lines(blood.glucose,fitted(lm.velo))
	lines(blood.glucose[!is.na(short.velocity)],fitted(lm.velo))
	cc <- complete.cases(thuesen)
	options(na.action=na.exclude)


	lm.velo <- lm(short.velocity~blood.glucose)
	fitted(lm.velo)
	?segments
	segments(blood.glucose, fitted(lm.velo), blood.glucose, short.velocity)
	plot(fitted(lm.velo),resid(lm.velo))
	qqnorm(resid(lm.velo))
	predict(lm.velo)
	?predict
	predict(lm.velo, int="confidence")

	#definition confidence interval: if confidence intervals are constructed across many separate data analyses of repeated (and possibly different) experiments, the proportion of such intervals that contain the true value of the parameter will match the confidence level; this is guaranteed by the reasoning underlying the construction of confidence intervals

	pred.frame <- data.frame(blood.glucose=1:24)
	head(pred.frame)
	head(lm.velo)
	pp <- predict(lm.velo, int="p", newdata=pred.frame)
	pc <- predict(lm.velo, int="c", newdata=pred.frame)
	plot(blood.glucose,short.velocity, ylim=range(short.velocity, pp, na.rm=T))
	pred.gluc <- pred.frame$blood.glucose
	?matlines
	matlines(pred.gluc, pc, lty=c(2,3,3), col="red")
	matlines(pred.gluc, pp, lty=c(1,3,3), col="blue")


	# multiple regression

	?par
	par(mex=0.5) #‘mex’ is a character size expansion factor which is used to
	# describe coordinates in the margins of plots. Note that this
	# does not change the font size, rather specifies the size of
	# font (as a multiple of ‘csi’) used to convert between ‘mar’
	# and ‘mai’, and between ‘oma’ and ‘omi’.
	pairs(cystfibr, gap=0, cex.labels=0.7) #gap: distance between subplots, in margin lines. cex.labels, font.labels: graphics parameters for the text panel.

	attach(cystfibr)
	lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
	summary(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc))
	anova(lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)) # these tests are successive; they correspond to (reading upward from the bottom) a stepwise removal of terms from the model until finally only age is left.
	m1<-lm(pemax~age+sex+height+weight+bmp+fev1+rv+frc+tlc)
	m2<-lm(pemax~age)
	anova(m1,m2)

	# > 955.4+155.0+632.3+2862.2+1549.1+561.9+194.6+92.4
	# [1] 7002.9
	# > 7002.9/8
	# [1] 875.3625
	# > 875.36/648.7
	# [1] 1.349407
	# > 1-pf(1.349407,8,15)
	# [1] 0.2935148


	# polynomial regression
	attach(cystfibr)
	summary(lm(pemax~height+I(height^2)))
	pred.frame <- data.frame(height=seq(110,180,2))
	lm.pemax.hq <- lm(pemax~height+I(height^2))
	predict(lm.pemax.hq,interval="pred",newdata=pred.frame)
	pp <- predict(lm.pemax.hq,newdata=pred.frame,interval="pred")
	pc <- predict(lm.pemax.hq,newdata=pred.frame,interval="conf")
	plot(height,pemax,ylim=c(0,200))
	matlines(pred.frame$height,pp,lty=c(1,2,2),col="black")
	matlines(pred.frame$height,pc,lty=c(1,3,3),col="black")



	# regression through the origin

	?runif
	x <- runif(20)
	x
	?rnorm
	y <- 2*x+rnorm(20,0,0.3)
	y
	summary(lm(y~x))
	summary(lm(y~x-1))
	anova(lm(y~x))


	# design matrices
	# If you add the columns together, weighted by the corresponding regression coefficients, you get the fitted values. the intercept enters as the coefficient to a column of ones.
	model.matrix(pemax~height+weight)

	attach(red.cell.folate)
	model.matrix(folate~ventilation) #for a model with factors then you have dummy variables




	# diagnostics
	attach(thuesen)
	options(na.action="na.exclude")
	lm.velo <- lm(short.velocity~blood.glucose)
	opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
	plot(lm.velo, which=1:4) # The third plot is of the square root of the absolute value of the standardized residuals; this reduces the skewness of the distribution and makes it much easier to detect if there might be a trend in the dispersion. The fourth plot is of “Cook’s distance”, which is a measure of the influence of each observation on the regression coefficients
	par(opar)
	opar <- par(mfrow=c(2,2), mex=0.6, mar=c(4,4,3,2)+.3)
	plot(rstandard(lm.velo)) # standardized residuals since residuals corresponding to extreme x-values generally have a lower SD due to overfitting
	plot(rstudent(lm.velo))
	plot(dffits(lm.velo),type="l") #how much an observation affects the as- sociated fitted value
	matplot(dfbetas(lm.velo),type="l", col="red") #change in the estimated parameters if an observation is excluded relative to its standard error.
	lines(sqrt(cooks.distance(lm.velo)), lwd=2)
	par(opar)
	summary(lm(short.velocity~blood.glucose, subset=-13))



	# logistic regression
	no.yes <- c("No","Yes")
	?gl
	smoking <- gl(2,1,8,no.yes)
	obesity <- gl(2,2,8,no.yes)
	snoring <- gl(2,4,8,no.yes)
	n.tot <- c(60,17,8,2,187,85,51,23)
	n.hyp <- c(5,2,1,0,35,13,15,8)
	data.frame(smoking,obesity,snoring,n.tot,n.hyp)
	expand.grid(smoking=no.yes, obesity=no.yes, snoring=no.yes)
	hyp.tbl <- cbind(n.hyp,n.tot-n.hyp)
	hyp.tbl


	glm(hyp.tbl ~ smoking + obesity + snoring, family=binomial("logit"))
	glm(hyp.tbl ~ smoking + obesity + snoring, binomial)
	prop.hyp <- n.hyp/n.tot
	glm.hyp <- glm(prop.hyp~smoking+obesity+snoring,binomial,weights=n.tot) # The other way to specify a logistic regression model is to give the proportion of diseased in each cell. It is necessary to give weights because R cannot see how many observations a proportion is based on.
	glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
	summary(glm.hyp)
	glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
	summary(glm.hyp)
	glm.hyp <- glm(hyp.tbl~smoking+obesity+snoring,binomial)
	anova(glm.hyp, test="Chisq")
	glm.hyp <- glm(hyp.tbl~snoring+obesity+smoking,binomial)
	anova(glm.hyp, test="Chisq")
	glm.hyp <- glm(hyp.tbl~obesity+snoring,binomial)
	anova(glm.hyp, test="Chisq")
	drop1(glm.hyp, test="Chisq")
	exp(cbind(OR=coef(glm.hyp), confint(glm.hyp)))


	library("ISwR")
	juul$menarche <- factor(juul$menarche, labels=c("No","Yes"))
	juul$tanner <- factor(juul$tanner)
	juul.girl <- subset(juul,age>8 & age<20 & complete.cases(menarche))
	attach(juul.girl)
	summary(glm(menarche ~ age,binomial))
	summary(glm(menarche ~ age+tanner,binomial))
	drop1(glm(menarche ~ age+tanner,binomial),test="Chisq")


	predict(glm.hyp)
	predict(glm.hyp, type="response") #obtain probabilities
	plot(age, fitted(glm(menarche~age,binomial)))
	glm.menarche <- glm(menarche~age, binomial)
	Age <- seq(8,20,.1)
	newages <- data.frame(age=Age)
	newages
	predicted.probability <- predict(glm.menarche, newages, type="resp")
	plot(predicted.probability ~ Age, type="l")