sebnyberg/3.4.R

## 3.4.R
library(ggplot2)
source('helpers.R')


# ======================================
# ==============[ Q 3.4 ]===============
# ======================================

# prepare the model by defining factors
load('hilda.Rdata')
hilda <- setup_data_base(hilda)
hilda <- setup_data(hilda)


model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial)
model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb +
                    hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown +
                    hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh +
                    hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial)

step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1)


# exercise seems like a good category to introduce, but many did not answer
# or were not asked.. we need to inspect a subset of the entire dataset
model.new <- update(model.current, . ~ . + hilda$exercise)
compare_models(model.current, model.new)
NROW(model.new$fitted.values)


# about half the subjects were not asked about whether or not they exercise.
# we can however use those who did answer to check if there is any interesting
# relationship between exercise and hospital days

# ==============[ EXERCISE ]===============


load('hilda.Rdata')
hilda2 <- setup_data_base(hilda)

# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
# not asked has 4 entries, dont know has 0, no answer has 26
hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA
hilda2$exercise <- factor(hilda2$exercise,
                          labels = c("practically none", "sometimes" ,"once a week",
                                     "twice a week", "at least twice"),
                          levels = c(1,2,3,4,5))


hilda2 <- na.omit(hilda2)

# plot histogram, looks alright
hist(as.numeric(hilda$exercise))

# lets look at the probabilities for the categories..
agg <- get_aggregate(hilda2, hilda2$exercise)
agg
plot(agg$variable, agg$props)

levels(hilda2$exercise)
hilda2$exercise

model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial)
summary(model.exercise)

age.sq2 <- hilda2$age^2
levels(hilda2$health) <- c("good", "not good", "not good")

model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial)
model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial)
model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial)

# not significant
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.health, model.new)

# significant
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.age, model.new)

# significant
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.sex, model.new)

# what if we merge the groups to exercise yes/no?
levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1))
agg <- get_aggregate(hilda2, hilda2$exercise)
agg
plot(agg$variable, agg$props)

# lets compare again

# significant (just barely)
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.health, model.new)

# significant
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.age, model.new)

# significant
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.sex, model.new)

# what does this tell us? it does seem that exercise is correlated with health
# which makes sense..

# we need to relevel exercise so that its yes / no against good / not good health
cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health))

# sanity check, this should be much lower...
cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex))


# ==============[ Household income ]===============

load('hilda.Rdata')
hilda <- setup_data_base(hilda)


hist(hilda$inc_hh[hilda$inc_hh <= 5000])

# form the first model by splitting the ages into groups of five
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000))
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F)

# first group is extremely small (141)... merge with second lowest
NROW(hilda$inc_hh[hilda$inc_hh.interval == 0])
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000))
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F)

# looks evenly distributed.. lets rename the categories

# rename the factors..
hilda$inc_hh.interval <- factor(hilda$inc_hh.interval,
                                labels=c("1-999", "1000-1499", "1500-1999", "2000+"))

agg <- get_aggregate(hilda, hilda$inc_hh.interval)
agg
plot(agg$variable, agg$props)


# although this data is continuous, it looks like it makes sense to split it into
# two halves, one for 1-1499 and one for 1500+, lets test this hypothesis

model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
summary(model.income)

# 1000-1499 is not significantly different from 0-1000..

# lets compare category 3 and 4
model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial)
summary(model.income)
# as expected, 2000+ is not significantly different from 1500-1999

# lets merge and make the model (DO NOT RUN THIS TWICE)
if (NROW(levels(hilda$inc_hh.interval)) > 2){
  levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high")
}
model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
summary(model.income)

# now lets compare the model against other models such as age, sex, exercise

# first we merge categories as per usual
levels(hilda$health) <- c("good", "not good", "not good")

model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial)
model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial)
model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial)

# significant
model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.health, model.new)

# not significant (?)
model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.age, model.new)

# significant
model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.sex, model.new)

# lets compare against a more complex model.

model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial)
model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.prev, model.new)
	library(ggplot2)
	source('helpers.R')



	# ======================================
	# ==============[ Q 3.4 ]===============
	# ======================================

	# prepare the model by defining factors
	load('hilda.Rdata')
	hilda <- setup_data_base(hilda)
	hilda <- setup_data(hilda)


	model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial)
	model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb +
	hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown +
	hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh +
	hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial)

	step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1)


	# exercise seems like a good category to introduce, but many did not answer
	# or were not asked.. we need to inspect a subset of the entire dataset
	model.new <- update(model.current, . ~ . + hilda$exercise)
	compare_models(model.current, model.new)
	NROW(model.new$fitted.values)


	# about half the subjects were not asked about whether or not they exercise.
	# we can however use those who did answer to check if there is any interesting
	# relationship between exercise and hospital days

	# ==============[ EXERCISE ]===============


	load('hilda.Rdata')
	hilda2 <- setup_data_base(hilda)

	# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
	# not asked has 4 entries, dont know has 0, no answer has 26
	hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA
	hilda2$exercise <- factor(hilda2$exercise,
	labels = c("practically none", "sometimes" ,"once a week",
	"twice a week", "at least twice"),
	levels = c(1,2,3,4,5))


	hilda2 <- na.omit(hilda2)

	# plot histogram, looks alright
	hist(as.numeric(hilda$exercise))

	# lets look at the probabilities for the categories..
	agg <- get_aggregate(hilda2, hilda2$exercise)
	agg
	plot(agg$variable, agg$props)

	levels(hilda2$exercise)
	hilda2$exercise

	model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial)
	summary(model.exercise)

	age.sq2 <- hilda2$age^2
	levels(hilda2$health) <- c("good", "not good", "not good")

	model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial)
	model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial)
	model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial)

	# not significant
	model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.health, model.new)

	# significant
	model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.sex, model.new)

	# what if we merge the groups to exercise yes/no?
	levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1))
	agg <- get_aggregate(hilda2, hilda2$exercise)
	agg
	plot(agg$variable, agg$props)

	# lets compare again

	# significant (just barely)
	model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.health, model.new)

	# significant
	model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
	compare_models(model.sex, model.new)

	# what does this tell us? it does seem that exercise is correlated with health
	# which makes sense..

	# we need to relevel exercise so that its yes / no against good / not good health
	cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health))

	# sanity check, this should be much lower...
	cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex))



	# ==============[ Household income ]===============

	load('hilda.Rdata')
	hilda <- setup_data_base(hilda)


	hist(hilda$inc_hh[hilda$inc_hh <= 5000])

	# form the first model by splitting the ages into groups of five
	hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000))
	hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F)

	# first group is extremely small (141)... merge with second lowest
	NROW(hilda$inc_hh[hilda$inc_hh.interval == 0])
	hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000))
	hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F)

	# looks evenly distributed.. lets rename the categories

	# rename the factors..
	hilda$inc_hh.interval <- factor(hilda$inc_hh.interval,
	labels=c("1-999", "1000-1499", "1500-1999", "2000+"))

	agg <- get_aggregate(hilda, hilda$inc_hh.interval)
	agg
	plot(agg$variable, agg$props)


	# although this data is continuous, it looks like it makes sense to split it into
	# two halves, one for 1-1499 and one for 1500+, lets test this hypothesis

	model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
	summary(model.income)

	# 1000-1499 is not significantly different from 0-1000..

	# lets compare category 3 and 4
	model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial)
	summary(model.income)
	# as expected, 2000+ is not significantly different from 1500-1999

	# lets merge and make the model (DO NOT RUN THIS TWICE)
	if (NROW(levels(hilda$inc_hh.interval)) > 2){
	levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high")
	}
	model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
	summary(model.income)

	# now lets compare the model against other models such as age, sex, exercise

	# first we merge categories as per usual
	levels(hilda$health) <- c("good", "not good", "not good")

	model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial)
	model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial)
	model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial)

	# significant
	model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.health, model.new)

	# not significant (?)
	model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.age, model.new)

	# significant
	model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.sex, model.new)

	# lets compare against a more complex model.

	model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial)
	model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial)
	compare_models(model.prev, model.new)