Skip to content

Instantly share code, notes, and snippets.

@sebnyberg
Created May 11, 2017 15:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebnyberg/1b6588d9730f5380cdd26e6b7b403a7a to your computer and use it in GitHub Desktop.
Save sebnyberg/1b6588d9730f5380cdd26e6b7b403a7a to your computer and use it in GitHub Desktop.
library(ggplot2)
# returns the odds of a model given an input vector x
get_log_odds <- function(model,x) {
n_coeff <- length(model$coefficients)
if (n_coeff - length(x) == 1) {
# user forgot the first 1 in [1, x1,...]
x <- c(1,x)
} else if (length(x) > n_coeff + 1
&& n_coeff - NCOL(x) == 1) {
x <- cbind(1, x)
}
if (n_coeff != NCOL(x) &&
n_coeff != length(x)) {
expected_n <- n_coeff - 1
print(noquote(c("X is not of correct length, each row should be of size", expected_n)))
return(NA)
}
# calculate the odds
log_odds = x %*% model$coefficients
return(log_odds)
}
# creates an aggregate table of proportions for some variable
get_aggregate <- function(dataset, list) {
if (max(dataset$hospdays > 1)) {
dataset$hospdays[dataset$hospdays >= 1] <- 1
}
if (!(c("no.hospdays") %in% colnames(dataset))) {
dataset$no.hospdays <- 1 - dataset$hospdays
}
res <- aggregate(dataset[,c("hospdays","no.hospdays")],by=list(variable=list),FUN=sum)
res$total <- res$hospdays + res$no.hospdays
res$props <- res$hospdays / res$total
return(res)
}
# compares two logistic models by calculating the log likelihood
# and testing against the chi squared distribution at 95%
compare_models <- function(current, new) {
d.new <- -2*logLik(new)
d.current <- -2*logLik(current)
d.diff <- d.current - d.new
result <- NA
retval <- NA
qquant <- NA
df = NROW(new$coefficients) - NROW(current$coefficients)
if (df < 1) {
print(noquote("cannot use chisq since df is the same or lower"))
}
else {
qquant <- qchisq(1-0.05,df)
if (d.diff > qquant) {
result <- "Yes"
retval <- TRUE
} else {
result <- "No"
retval <- FALSE
}
}
result.frame = data.frame(d.new = c(d.new),
d.current = c(d.current),
d.diff = c(d.diff),
df = c(df),
qquant = c(qquant),
reject.h0 = c(result))
print(result.frame)
return(retval)
}
setup_data_base <- function(data) {
# redefine the hospitaldays to be 1 for anything above 1
data$hospdays[(data$hospdays >= 1)] = 1
# count no.hospdays to be the complement of hospdays
data$no.hospdays = 1 - data$hospdays
# define sex as a factor
data$sex <- factor(data$sex, labels=c("male", "female"), levels=c(1,2))
# redefine all answers which were "other" or "not answered" to be NA
data$health[(data$health >= 4)] = NA
data$health <- factor(data$health, labels=c("good", "bad", "inbetween"))
return(data)
}
setup_data <- function(data) {
#load('hilda.Rdata', envir=globalenv())
#hildacopy <- hilda
data <- setup_data_base(data)
# civilst from registry
data$civilst_rtb <- factor(data$civilst_rtb,
labels = c("unmarried", "married man",
"married woman living apart",
"divorced", "widow/widower", "dead",
"married womand with husband", "child<18",
"foster child<18"),
levels = c(1,2,3,4,5,6,7,8,9))
# civilst in interview (9 = no answer)
data$civilst[data$civilst == 9] <- NA
data$civilst <- factor(data$civilst,
labels = c("unmarried", "married",
"divorced/separated", "widow/widower"),
levels = c(1,2,3,4))
# education (0 = not asked)
data$educ[data$educ == 0] <- NA
data$educ <- factor(data$educ,
labels = c("pregymnasial<9", "pregymnasial9-10",
"gymnasial<2", "gymnasial2-3", "post-gymnasial<3",
"post-gymnasial>3", "post-graduate"),
levels = c(1,2,3,4,5,6,7))
# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
# not asked has 4 entries, dont know has 0, no answer has 26
data$exercise[data$exercise %in% c(0,8,9)] <- NA
data$exercise <- factor(data$exercise,
labels = c("practically none", "sometimes" ,"once a week",
"twice a week", "at least twice"),
levels = c(1,2,3,4,5))
# normal hours of work per week
data$work_norm <- factor(data$work_norm,
labels = c("1-19hr", "20-34hr", "35-97hr",
"farmer or self-employed", "other/doesntwork/dontknow"),
levels = c(1,2,3,4,5))
# exercise habits (98 = dont know, 99 = no answer)
data$work_week[data$work_week %in% c(98,99)] <- NA
# income (999999 = missing)
data$inc_hh[data$inc_hh == 999999] <- NA
data$inc_salary[data$inc_salary == 999999] <- NA
data$inc_cap[data$inc_cap == 999999] <- NA
data$inc_work[data$inc_work == 999999] <- NA
data$inc_pens[data$inc_pens == 999999] <- NA
return(data)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment