sebnyberg/3.2.R

## 3.2.R

# ========================================
# ==============[ Q 3.2.1 ]===============
# ========================================
load('hilda.Rdata')
source('helpers.R')

hilda <- setup_data_base(hilda)

# form the first model by splitting the ages into groups of five
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95))

# compare number of samples in the intervals
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
     main="Number of people per age interval",
     xlab="Age interval")

# save
pdf(file="3_2_1-hist-10categories.pdf")
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
     main="Number of people per age interval",
     xlab="Age interval")
dev.off()

# merge last two categories for a more even distribution of data
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85))

# compare number of samples in the intervals
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
     main="Number of people per age interval",
     xlab="Age interval")

# save
pdf(file="3_2_1-hist-8categories.pdf")
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
     main="Number of people per age interval",
     xlab="Age interval")
dev.off()

# rename the factors..
hilda$age.interval <- factor(hilda$age.interval,
                             labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+"))

# make a list of failures
hilda$no.hospdays <- 1 - hilda$hospdays

# looks alright.. compare the values of the different proportions
# let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables
# which are x2 and gr. Then the summation function is applied to aggregated values.
agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum)
agg$total_events <- agg$hospdays + agg$no.hospdays

# find median for our categories
i = 1
agg$xmid <- NA
for (interval in levels(hilda$age.interval)) {
  agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval])
  i = i + 1
}

# calculate proportions
agg$prop<-agg$hospdays/agg$total_events

# plot the proportions
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
     type="l", xlab = "age", ylab = "probability of spending at least one hospital day")

# save
pdf("3_2_1-proportions.pdf")
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
     type="l", xlab = "age", ylab = "probability of spending at least one hospital day")


# ========================================
# ==============[ Q 3.2.2 ]===============
# ========================================

# create the continuous model
model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial)

# create a continuous model with a second order term
age <- hilda$age
age.sq <- hilda$age^2
model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial)

# in order to plot we make predictions for a sequence of x-values
x1 <- seq(50,100,1)

odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2))
odds.cont  <- get_log_odds(model.age, cbind(x1))
probs.trans <- odds.trans / (1 + odds.trans)
probs.cont <- odds.cont / (1 + odds.cont)

# plot proportions vs continous without transformation
plot(x1, probs.cont,
     type="l",
     ylim = c(0.05,0.40),
     xlim = c(50,100),
     xlab = "age",
     ylab = "probability of spending a hospital day",
     main="Proportions of categorical age\n vs probabilities of continuous age")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")

# save to file
pdf(file="3_2_2-no-transformation.pdf")
plot(x1, probs.cont,
     type="l",
     ylim = c(0.05,0.40),
     xlim = c(50,100),
     xlab = "age",
     ylab = "probability of spending a hospital day",
     main="Proportions of categorical age\n vs probabilities of continuous age")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
dev.off()

# plot proportions vs our continuous logistic regression
plot(x1, probs.trans,
     type="l",
     ylim = c(0.05,0.30),
     xlim = c(50,100),
     xlab = "age",
     ylab = "probability of spending a hospital day",
     main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")

# save to file
pdf(file="3_2_2-with-transformation.pdf")
plot(x1, probs.trans,
     type="l",
     ylim = c(0.05,0.30),
     xlim = c(50,100),
     xlab = "age",
     ylab = "probability of spending a hospital day",
     main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
dev.off()

model.current <- model.age.trans


# ========================================
# ==============[ Q 3.2.3 ]===============
# ========================================

# Hack... get the se.fit from the predict function and find someone who is 70 years old..

index.70yo <- min(which(hilda$age == 70))
xbhat <- predict(model.age.trans, se.fit = T)
se.70yo <- xbhat$se.fit[index.70yo]

summary(model.age.trans)

lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2))

probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo))

odds_lo <- lnodds.70yo + 1.96 * se.70yo
odds_hi <- lnodds.70yo - 1.96 * se.70yo

ci_odds_lo <- exp(odds_lo)
ci_odds_lo
ci_odds_hi <- exp(odds_hi)
ci_odds_hi

ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo)
ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi)


# ========================================
# ==============[ Q 3.2.4 ]===============
# ========================================

# First we calculate the W matrix
yhat <- predict(model.current, type="response")
W <- diag(yhat * (1 - yhat))
X <- cbind(1, age, age.sq)
cov_matrix <- solve(t(X) %*% W %*% X)

# Did we get the correct result?
summary(model.current)$cov.unscaled
cov_matrix

# Indeedidoodely we did! Amazing

	# ========================================
	# ==============[ Q 3.2.1 ]===============
	# ========================================
	load('hilda.Rdata')
	source('helpers.R')

	hilda <- setup_data_base(hilda)

	# form the first model by splitting the ages into groups of five
	hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95))

	# compare number of samples in the intervals
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
	main="Number of people per age interval",
	xlab="Age interval")

	# save
	pdf(file="3_2_1-hist-10categories.pdf")
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
	main="Number of people per age interval",
	xlab="Age interval")
	dev.off()

	# merge last two categories for a more even distribution of data
	hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85))

	# compare number of samples in the intervals
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
	main="Number of people per age interval",
	xlab="Age interval")

	# save
	pdf(file="3_2_1-hist-8categories.pdf")
	hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
	main="Number of people per age interval",
	xlab="Age interval")
	dev.off()

	# rename the factors..
	hilda$age.interval <- factor(hilda$age.interval,
	labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+"))

	# make a list of failures
	hilda$no.hospdays <- 1 - hilda$hospdays

	# looks alright.. compare the values of the different proportions
	# let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables
	# which are x2 and gr. Then the summation function is applied to aggregated values.
	agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum)
	agg$total_events <- agg$hospdays + agg$no.hospdays

	# find median for our categories
	i = 1
	agg$xmid <- NA
	for (interval in levels(hilda$age.interval)) {
	agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval])
	i = i + 1
	}

	# calculate proportions
	agg$prop<-agg$hospdays/agg$total_events

	# plot the proportions
	plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
	type="l", xlab = "age", ylab = "probability of spending at least one hospital day")

	# save
	pdf("3_2_1-proportions.pdf")
	plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
	type="l", xlab = "age", ylab = "probability of spending at least one hospital day")


	# ========================================
	# ==============[ Q 3.2.2 ]===============
	# ========================================

	# create the continuous model
	model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial)

	# create a continuous model with a second order term
	age <- hilda$age
	age.sq <- hilda$age^2
	model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial)

	# in order to plot we make predictions for a sequence of x-values
	x1 <- seq(50,100,1)

	odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2))
	odds.cont <- get_log_odds(model.age, cbind(x1))
	probs.trans <- odds.trans / (1 + odds.trans)
	probs.cont <- odds.cont / (1 + odds.cont)

	# plot proportions vs continous without transformation
	plot(x1, probs.cont,
	type="l",
	ylim = c(0.05,0.40),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\n vs probabilities of continuous age")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")

	# save to file
	pdf(file="3_2_2-no-transformation.pdf")
	plot(x1, probs.cont,
	type="l",
	ylim = c(0.05,0.40),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\n vs probabilities of continuous age")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")
	dev.off()

	# plot proportions vs our continuous logistic regression
	plot(x1, probs.trans,
	type="l",
	ylim = c(0.05,0.30),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")

	# save to file
	pdf(file="3_2_2-with-transformation.pdf")
	plot(x1, probs.trans,
	type="l",
	ylim = c(0.05,0.30),
	xlim = c(50,100),
	xlab = "age",
	ylab = "probability of spending a hospital day",
	main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
	lines(agg$xmid, agg$prop, lty=2, col="blue")
	points(agg$xmid, agg$prop, pch=8, col="blue")
	dev.off()

	model.current <- model.age.trans


	# ========================================
	# ==============[ Q 3.2.3 ]===============
	# ========================================

	# Hack... get the se.fit from the predict function and find someone who is 70 years old..

	index.70yo <- min(which(hilda$age == 70))
	xbhat <- predict(model.age.trans, se.fit = T)
	se.70yo <- xbhat$se.fit[index.70yo]

	summary(model.age.trans)

	lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2))

	probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo))

	odds_lo <- lnodds.70yo + 1.96 * se.70yo
	odds_hi <- lnodds.70yo - 1.96 * se.70yo

	ci_odds_lo <- exp(odds_lo)
	ci_odds_lo
	ci_odds_hi <- exp(odds_hi)
	ci_odds_hi

	ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo)
	ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi)


	# ========================================
	# ==============[ Q 3.2.4 ]===============
	# ========================================

	# First we calculate the W matrix
	yhat <- predict(model.current, type="response")
	W <- diag(yhat * (1 - yhat))
	X <- cbind(1, age, age.sq)
	cov_matrix <- solve(t(X) %% W %% X)

	# Did we get the correct result?
	summary(model.current)$cov.unscaled
	cov_matrix

	# Indeedidoodely we did! Amazing