Skip to content

Instantly share code, notes, and snippets.

@sebnyberg
Created May 12, 2017 13:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebnyberg/9977c2366aed841f725b26301f49c0ee to your computer and use it in GitHub Desktop.
Save sebnyberg/9977c2366aed841f725b26301f49c0ee to your computer and use it in GitHub Desktop.
library(ggplot2)
source('helpers.R')
# ======================================
# ==============[ Q 3.4 ]===============
# ======================================
# prepare the model by defining factors
load('hilda.Rdata')
hilda <- setup_data_base(hilda)
hilda <- setup_data(hilda)
model.current <- glm(hilda$hospdays ~ hilda$age + age.sq + hilda$health + hilda$sex, family=binomial)
model.full <- glm(hilda$hospdays ~ hilda$sex + hilda$brthyr + hilda$age + age.sq + hilda$civilst_rtb +
hilda$civilst + hilda$educ + hilda$child_small + hilda$child_grown +
hilda$exercise + hilda$health + hilda$work_norm + hilda$inc_hh +
hilda$inc_salary + hilda$inc_work + hilda$inc_cap + hilda$inc_pens, family=binomial)
step(model.current, scope=list(lower=model.current, upper=model.full), direction="both", steps = 1)
# exercise seems like a good category to introduce, but many did not answer
# or were not asked.. we need to inspect a subset of the entire dataset
model.new <- update(model.current, . ~ . + hilda$exercise)
compare_models(model.current, model.new)
NROW(model.new$fitted.values)
# about half the subjects were not asked about whether or not they exercise.
# we can however use those who did answer to check if there is any interesting
# relationship between exercise and hospital days
# ==============[ EXERCISE ]===============
load('hilda.Rdata')
hilda2 <- setup_data_base(hilda)
# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
# not asked has 4 entries, dont know has 0, no answer has 26
hilda2$exercise[hilda2$exercise %in% c(0,8,9)] <- NA
hilda2$exercise <- factor(hilda2$exercise,
labels = c("practically none", "sometimes" ,"once a week",
"twice a week", "at least twice"),
levels = c(1,2,3,4,5))
hilda2 <- na.omit(hilda2)
# plot histogram, looks alright
hist(as.numeric(hilda$exercise))
# lets look at the probabilities for the categories..
agg <- get_aggregate(hilda2, hilda2$exercise)
agg
plot(agg$variable, agg$props)
levels(hilda2$exercise)
hilda2$exercise
model.exercise <- glm(hilda2$hospdays ~ hilda2$exercise, family=binomial)
summary(model.exercise)
age.sq2 <- hilda2$age^2
levels(hilda2$health) <- c("good", "not good", "not good")
model.age <- glm(hilda2$hospdays ~ hilda2$age, family=binomial)
model.sex <- glm(hilda2$hospdays ~ hilda2$sex, family=binomial)
model.health <- glm(hilda2$hospdays ~ hilda2$health, family=binomial)
# not significant
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.health, model.new)
# significant
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.age, model.new)
# significant
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.sex, model.new)
# what if we merge the groups to exercise yes/no?
levels(hilda2$exercise) <- c("no", rep("yes", NROW(levels(hilda2$exercise)) - 1))
agg <- get_aggregate(hilda2, hilda2$exercise)
agg
plot(agg$variable, agg$props)
# lets compare again
# significant (just barely)
model.new <- update(model.health, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.health, model.new)
# significant
model.new <- update(model.age, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.age, model.new)
# significant
model.new <- update(model.sex, . ~ . + hilda2$exercise, family=binomial)
compare_models(model.sex, model.new)
# what does this tell us? it does seem that exercise is correlated with health
# which makes sense..
# we need to relevel exercise so that its yes / no against good / not good health
cor(as.numeric(relevel(hilda2$exercise, ref=2)), as.numeric(hilda2$health))
# sanity check, this should be much lower...
cor(as.numeric(hilda2$exercise), as.numeric(hilda2$sex))
# ==============[ Household income ]===============
load('hilda.Rdata')
hilda <- setup_data_base(hilda)
hist(hilda$inc_hh[hilda$inc_hh <= 5000])
# form the first model by splitting the ages into groups of five
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(500,1000,1500,2000))
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4,5), include.lowest=T, right=F)
# first group is extremely small (141)... merge with second lowest
NROW(hilda$inc_hh[hilda$inc_hh.interval == 0])
hilda$inc_hh.interval <- findInterval(hilda$inc_hh, c(1000,1500,2000))
hist(hilda$inc_hh.interval, breaks=c(0,1,2,3,4), include.lowest=T, right=F)
# looks evenly distributed.. lets rename the categories
# rename the factors..
hilda$inc_hh.interval <- factor(hilda$inc_hh.interval,
labels=c("1-999", "1000-1499", "1500-1999", "2000+"))
agg <- get_aggregate(hilda, hilda$inc_hh.interval)
agg
plot(agg$variable, agg$props)
# although this data is continuous, it looks like it makes sense to split it into
# two halves, one for 1-1499 and one for 1500+, lets test this hypothesis
model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
summary(model.income)
# 1000-1499 is not significantly different from 0-1000..
# lets compare category 3 and 4
model.income <- glm(hilda$hospdays ~ relevel(hilda$inc_hh.interval, ref=3), family=binomial)
summary(model.income)
# as expected, 2000+ is not significantly different from 1500-1999
# lets merge and make the model (DO NOT RUN THIS TWICE)
if (NROW(levels(hilda$inc_hh.interval)) > 2){
levels(hilda$inc_hh.interval) <- c("low", "low", "high", "high")
}
model.income <- glm(hilda$hospdays ~ hilda$inc_hh.interval, family=binomial)
summary(model.income)
# now lets compare the model against other models such as age, sex, exercise
# first we merge categories as per usual
levels(hilda$health) <- c("good", "not good", "not good")
model.age <- glm(hilda$hospdays ~ hilda$age, family=binomial)
model.sex <- glm(hilda$hospdays ~ hilda$sex, family=binomial)
model.health <- glm(hilda$hospdays ~ hilda$health, family=binomial)
# significant
model.new <- update(model.health, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.health, model.new)
# not significant (?)
model.new <- update(model.age, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.age, model.new)
# significant
model.new <- update(model.sex, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.sex, model.new)
# lets compare against a more complex model.
model.prev <- glm(hilda$hospdays ~ hilda$health + hilda$age + age.sq + hilda$sex, family=binomial)
model.new <- update(model.prev, . ~ . + hilda$inc_hh.interval, family=binomial)
compare_models(model.prev, model.new)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment