sebnyberg/3.1.R

## 3.1.R
# ========================================
# ==============[ Q 3.1.1 ]===============
# ========================================

# load data
load('hilda.Rdata')
source('helpers.R')

# redifine the hospitaldays to be 1 for anything above 1
hilda$hospdays[(hilda$hospdays >= 1)] = 1
hilda$hospdays

# redefine all answers which were "other" or "not answered" to be NA
hilda$health[(hilda$health >= 4)] = NA
hilda$health <- factor(hilda$health, labels=c("good", "bad", "inbetween"))
hilda$health

# check that all covariates are sigificantly different from zero
model.no.ref <- glm(hilda$hospdays ~ hilda$health - 1, family="binomial")
#                                      ^^^^^^ this makes categories non-relative to the reference
summary(model.no.ref)

# they are significantly different from zero
# lets try a model with healthgood as a reference category
model.ref.good <- glm(hilda$hospdays ~ hilda$health, family="binomial")
summary(model.ref.good)

# lets check if healthbad is significantly different from health inbetween
model.ref.bad <- glm(hilda$hospdays ~ factor(hilda$health, levels=c("bad","inbetween","good")), family="binomial")
summary(model.ref.bad)

# inbetween is not significantly different from bad (the reference category)...

# we should consider using a model which merges the categories.
# Lets check the ANOVA for the merged vs split model.

# create merged health
healthmerged <- hilda$health
levels(healthmerged) <- c("good", "not good", "not good")

# compare models
model.original <- glm(hilda$hospdays ~ hilda$health, family=binomial)
model.merged <- glm(hilda$hospdays ~ healthmerged, family=binomial)

# anova will use a likelihood ratio test to compare the models
anova(model.merged, model.original)
qchisq(1-0.05,1)

# since 3.444 < 3.841459 we can not reject the null hypothesis (that the two models are the same)
# thus, merging the models does not result in a loss or improvement either way
model.health <- model.merged

# merge the groups, since the Pr(>|z|) was high (0.6)
hilda$health <- healthmerged


# ========================================
# ==============[ Q 3.1.2 ]===============
# ========================================


# since the model coefficients are w.r.t. the reference category (good)
# the odds ratio for people with not good as compared to good is
ln.or.notgood <- model.health$coefficients[1]
or.notgood <- exp(ln.or.notgood)

# the S.E. is the second diagonal element in the covariance matrix
# of our model
se <- sqrt(diag(summary(model.health)$cov.unscaled))
se.or.notgood <- se[2]

# calculate bounds
lo <- ln.or.notgood - 1.96 * se.or.notgood
hi <- ln.or.notgood + 1.96 * se.or.notgood

# calculate confidence interval for
ci.or.notgood.lo <- exp(lo)
ci.or.notgood.hi <- exp(hi)


# ========================================
# ==============[ Q 3.1.3 ]===============
# ========================================

beta0 <- model.health$coefficients[1]
beta1 <- model.health$coefficients[2]

ln.odds.notgood <- beta0 + beta1
odds.notgood <- exp(ln.odds.notgood)

se.notgood <- sqrt(se[2]^2 - se[1]^2)

# calculate bounds
lo.notgood <- ln.odds.notgood - 1.96 * se.notgood
hi.notgood <- ln.odds.notgood + 1.96 * se.notgood

# calculate confidence interval for odds of not good
ci.notgood.lo <- exp(lo.notgood)
ci.notgood.hi <- exp(hi.notgood)


# ========================================
# ==============[ Q 3.1.4 ]===============
# ========================================

# probability of having at least one hospital day for people with good health:

odds.good <- exp(beta0)
prob.good <- odds.good / (1 + odds.good)

se.good <- se[1]

lo.good <- beta0 - 1.96 * se.good
hi.good <- beta0 + 1.96 * se.good

ci.prob.good.lo <- exp(lo.good) / (1 + exp(lo.good))
ci.prob.good.hi <- exp(hi.good) / (1 + exp(hi.good))

# for not good health

prob.notgood <- odds.notgood / (1 + odds.notgood)

ci.prob.notgood.lo <- exp(lo.notgood) / (1 + exp(lo.notgood))
ci.prob.notgood.hi <- exp(hi.notgood) / (1 + exp(hi.notgood))
	# ========================================
	# ==============[ Q 3.1.1 ]===============
	# ========================================

	# load data
	load('hilda.Rdata')
	source('helpers.R')

	# redifine the hospitaldays to be 1 for anything above 1
	hilda$hospdays[(hilda$hospdays >= 1)] = 1
	hilda$hospdays

	# redefine all answers which were "other" or "not answered" to be NA
	hilda$health[(hilda$health >= 4)] = NA
	hilda$health <- factor(hilda$health, labels=c("good", "bad", "inbetween"))
	hilda$health

	# check that all covariates are sigificantly different from zero
	model.no.ref <- glm(hilda$hospdays ~ hilda$health - 1, family="binomial")
	# ^^^^^^ this makes categories non-relative to the reference
	summary(model.no.ref)

	# they are significantly different from zero
	# lets try a model with healthgood as a reference category
	model.ref.good <- glm(hilda$hospdays ~ hilda$health, family="binomial")
	summary(model.ref.good)

	# lets check if healthbad is significantly different from health inbetween
	model.ref.bad <- glm(hilda$hospdays ~ factor(hilda$health, levels=c("bad","inbetween","good")), family="binomial")
	summary(model.ref.bad)

	# inbetween is not significantly different from bad (the reference category)...

	# we should consider using a model which merges the categories.
	# Lets check the ANOVA for the merged vs split model.

	# create merged health
	healthmerged <- hilda$health
	levels(healthmerged) <- c("good", "not good", "not good")

	# compare models
	model.original <- glm(hilda$hospdays ~ hilda$health, family=binomial)
	model.merged <- glm(hilda$hospdays ~ healthmerged, family=binomial)

	# anova will use a likelihood ratio test to compare the models
	anova(model.merged, model.original)
	qchisq(1-0.05,1)

	# since 3.444 < 3.841459 we can not reject the null hypothesis (that the two models are the same)
	# thus, merging the models does not result in a loss or improvement either way
	model.health <- model.merged

	# merge the groups, since the Pr(>\|z\|) was high (0.6)
	hilda$health <- healthmerged


	# ========================================
	# ==============[ Q 3.1.2 ]===============
	# ========================================


	# since the model coefficients are w.r.t. the reference category (good)
	# the odds ratio for people with not good as compared to good is
	ln.or.notgood <- model.health$coefficients[1]
	or.notgood <- exp(ln.or.notgood)

	# the S.E. is the second diagonal element in the covariance matrix
	# of our model
	se <- sqrt(diag(summary(model.health)$cov.unscaled))
	se.or.notgood <- se[2]

	# calculate bounds
	lo <- ln.or.notgood - 1.96 * se.or.notgood
	hi <- ln.or.notgood + 1.96 * se.or.notgood

	# calculate confidence interval for
	ci.or.notgood.lo <- exp(lo)
	ci.or.notgood.hi <- exp(hi)


	# ========================================
	# ==============[ Q 3.1.3 ]===============
	# ========================================

	beta0 <- model.health$coefficients[1]
	beta1 <- model.health$coefficients[2]

	ln.odds.notgood <- beta0 + beta1
	odds.notgood <- exp(ln.odds.notgood)

	se.notgood <- sqrt(se[2]^2 - se[1]^2)

	# calculate bounds
	lo.notgood <- ln.odds.notgood - 1.96 * se.notgood
	hi.notgood <- ln.odds.notgood + 1.96 * se.notgood

	# calculate confidence interval for odds of not good
	ci.notgood.lo <- exp(lo.notgood)
	ci.notgood.hi <- exp(hi.notgood)



	# ========================================
	# ==============[ Q 3.1.4 ]===============
	# ========================================

	# probability of having at least one hospital day for people with good health:

	odds.good <- exp(beta0)
	prob.good <- odds.good / (1 + odds.good)

	se.good <- se[1]

	lo.good <- beta0 - 1.96 * se.good
	hi.good <- beta0 + 1.96 * se.good

	ci.prob.good.lo <- exp(lo.good) / (1 + exp(lo.good))
	ci.prob.good.hi <- exp(hi.good) / (1 + exp(hi.good))

	# for not good health

	prob.notgood <- odds.notgood / (1 + odds.notgood)

	ci.prob.notgood.lo <- exp(lo.notgood) / (1 + exp(lo.notgood))
	ci.prob.notgood.hi <- exp(hi.notgood) / (1 + exp(hi.notgood))