Skip to content

Instantly share code, notes, and snippets.

@sebnyberg
Created May 11, 2017 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebnyberg/0b4d83c97334227f350587a7dc2d7b61 to your computer and use it in GitHub Desktop.
Save sebnyberg/0b4d83c97334227f350587a7dc2d7b61 to your computer and use it in GitHub Desktop.
# ========================================
# ==============[ Q 3.2.1 ]===============
# ========================================
load('hilda.Rdata')
source('helpers.R')
hilda <- setup_data_base(hilda)
# form the first model by splitting the ages into groups of five
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85,90,95))
# compare number of samples in the intervals
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
main="Number of people per age interval",
xlab="Age interval")
# save
pdf(file="3_2_1-hist-10categories.pdf")
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8,9,10),
main="Number of people per age interval",
xlab="Age interval")
dev.off()
# merge last two categories for a more even distribution of data
hilda$age.interval <- findInterval(hilda$age, c(50,55,60,65,70,75,80,85))
# compare number of samples in the intervals
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
main="Number of people per age interval",
xlab="Age interval")
# save
pdf(file="3_2_1-hist-8categories.pdf")
hist(hilda$age.interval, breaks = c(1,2,3,4,5,6,7,8),
main="Number of people per age interval",
xlab="Age interval")
dev.off()
# rename the factors..
hilda$age.interval <- factor(hilda$age.interval,
labels=c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84","85+"))
# make a list of failures
hilda$no.hospdays <- 1 - hilda$hospdays
# looks alright.. compare the values of the different proportions
# let's "aggregate" the values of x1, success, fails, sum_p according to the remaining variables
# which are x2 and gr. Then the summation function is applied to aggregated values.
agg<-aggregate(hilda[,c("hospdays","no.hospdays")],by=list(age.interval=hilda$age.interval),FUN=sum)
agg$total_events <- agg$hospdays + agg$no.hospdays
# find median for our categories
i = 1
agg$xmid <- NA
for (interval in levels(hilda$age.interval)) {
agg$xmid[i] <- median(hilda$age[hilda$age.interval == interval])
i = i + 1
}
# calculate proportions
agg$prop<-agg$hospdays/agg$total_events
# plot the proportions
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
type="l", xlab = "age", ylab = "probability of spending at least one hospital day")
# save
pdf("3_2_1-proportions.pdf")
plot(agg$age.interval, agg$prop, breaks=c(1,2,3,4,5,6,7,8),
type="l", xlab = "age", ylab = "probability of spending at least one hospital day")
# ========================================
# ==============[ Q 3.2.2 ]===============
# ========================================
# create the continuous model
model.age <- glm(hilda$hospdays ~ hilda$age, family = binomial)
# create a continuous model with a second order term
age <- hilda$age
age.sq <- hilda$age^2
model.age.trans <- glm(hilda$hospdays ~ age + age.sq^2, family = binomial)
# in order to plot we make predictions for a sequence of x-values
x1 <- seq(50,100,1)
odds.trans <- get_log_odds(model.age.trans, cbind(x1, x1^2))
odds.cont <- get_log_odds(model.age, cbind(x1))
probs.trans <- odds.trans / (1 + odds.trans)
probs.cont <- odds.cont / (1 + odds.cont)
# plot proportions vs continous without transformation
plot(x1, probs.cont,
type="l",
ylim = c(0.05,0.40),
xlim = c(50,100),
xlab = "age",
ylab = "probability of spending a hospital day",
main="Proportions of categorical age\n vs probabilities of continuous age")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
# save to file
pdf(file="3_2_2-no-transformation.pdf")
plot(x1, probs.cont,
type="l",
ylim = c(0.05,0.40),
xlim = c(50,100),
xlab = "age",
ylab = "probability of spending a hospital day",
main="Proportions of categorical age\n vs probabilities of continuous age")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
dev.off()
# plot proportions vs our continuous logistic regression
plot(x1, probs.trans,
type="l",
ylim = c(0.05,0.30),
xlim = c(50,100),
xlab = "age",
ylab = "probability of spending a hospital day",
main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
# save to file
pdf(file="3_2_2-with-transformation.pdf")
plot(x1, probs.trans,
type="l",
ylim = c(0.05,0.30),
xlim = c(50,100),
xlab = "age",
ylab = "probability of spending a hospital day",
main="Proportions of categorical age\nvs probabilities of continuous age (transformed)")
lines(agg$xmid, agg$prop, lty=2, col="blue")
points(agg$xmid, agg$prop, pch=8, col="blue")
dev.off()
model.current <- model.age.trans
# ========================================
# ==============[ Q 3.2.3 ]===============
# ========================================
# Hack... get the se.fit from the predict function and find someone who is 70 years old..
index.70yo <- min(which(hilda$age == 70))
xbhat <- predict(model.age.trans, se.fit = T)
se.70yo <- xbhat$se.fit[index.70yo]
summary(model.age.trans)
lnodds.70yo <- get_log_odds(model.age.trans, c(1, 70, 70^2))
probs.70yo <- exp(lnodds.70yo) / (1 + exp(lnodds.70yo))
odds_lo <- lnodds.70yo + 1.96 * se.70yo
odds_hi <- lnodds.70yo - 1.96 * se.70yo
ci_odds_lo <- exp(odds_lo)
ci_odds_lo
ci_odds_hi <- exp(odds_hi)
ci_odds_hi
ci_probs_lo <- ci_odds_lo / (1 + ci_odds_lo)
ci_probs_hi <- ci_odds_hi / (1 + ci_odds_hi)
# ========================================
# ==============[ Q 3.2.4 ]===============
# ========================================
# First we calculate the W matrix
yhat <- predict(model.current, type="response")
W <- diag(yhat * (1 - yhat))
X <- cbind(1, age, age.sq)
cov_matrix <- solve(t(X) %*% W %*% X)
# Did we get the correct result?
summary(model.current)$cov.unscaled
cov_matrix
# Indeedidoodely we did! Amazing
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment