sebnyberg/helpers.R

## helpers.R

library(ggplot2)

# returns the odds of a model given an input vector x
get_log_odds <- function(model,x) {
  n_coeff <- length(model$coefficients)

  if (n_coeff - length(x) == 1) {
    # user forgot the first 1 in [1, x1,...]
    x <- c(1,x)
  } else if (length(x) > n_coeff + 1
             && n_coeff - NCOL(x) == 1) {
    x <- cbind(1, x)
  }

  if (n_coeff != NCOL(x) &&
      n_coeff != length(x)) {
    expected_n <- n_coeff - 1
    print(noquote(c("X is not of correct length, each row should be of size", expected_n)))
    return(NA)
  }

  # calculate the odds
  log_odds = x %*% model$coefficients

  return(log_odds)
}

# creates an aggregate table of proportions for some variable
get_aggregate <- function(dataset, list) {

  if (max(dataset$hospdays > 1)) {
    dataset$hospdays[dataset$hospdays >= 1] <- 1
  }
  if (!(c("no.hospdays") %in% colnames(dataset))) {
    dataset$no.hospdays <- 1 - dataset$hospdays
  }

  res <- aggregate(dataset[,c("hospdays","no.hospdays")],by=list(variable=list),FUN=sum)

  res$total <- res$hospdays + res$no.hospdays
  res$props <- res$hospdays / res$total

  return(res)
}


# compares two logistic models by calculating the log likelihood
# and testing against the chi squared distribution at 95%
compare_models <- function(current, new) {

  d.new <- -2*logLik(new)
  d.current <- -2*logLik(current)

  d.diff <- d.current - d.new

  result <- NA
  retval <- NA
  qquant <- NA

  df = NROW(new$coefficients) - NROW(current$coefficients)

  if (df < 1) {
    print(noquote("cannot use chisq since df is the same or lower"))
  }
  else {
    qquant <- qchisq(1-0.05,df)

    if (d.diff > qquant) {
      result <- "Yes"
      retval <- TRUE
    } else {
      result <- "No"
      retval <- FALSE
    }
  }

  result.frame = data.frame(d.new = c(d.new),
                            d.current = c(d.current),
                            d.diff = c(d.diff),
                            df = c(df),
                            qquant = c(qquant),
                            reject.h0 = c(result))

  print(result.frame)

  return(retval)
}


setup_data_base <- function(data) {
  # redefine the hospitaldays to be 1 for anything above 1
  data$hospdays[(data$hospdays >= 1)] = 1

  # count no.hospdays to be the complement of hospdays
  data$no.hospdays = 1 - data$hospdays

  # define sex as a factor
  data$sex <- factor(data$sex, labels=c("male", "female"), levels=c(1,2))


  # redefine all answers which were "other" or "not answered" to be NA
  data$health[(data$health >= 4)] = NA
  data$health <- factor(data$health, labels=c("good", "bad", "inbetween"))

  return(data)
}


setup_data <- function(data) {
  #load('hilda.Rdata', envir=globalenv())

  #hildacopy <- hilda

  data <- setup_data_base(data)


  # civilst from registry
  data$civilst_rtb <- factor(data$civilst_rtb,
                             labels = c("unmarried", "married man",
                                        "married woman living apart",
                                        "divorced", "widow/widower", "dead",
                                        "married womand with husband", "child<18",
                                        "foster child<18"),
                             levels = c(1,2,3,4,5,6,7,8,9))

  # civilst in interview (9 = no answer)
  data$civilst[data$civilst == 9] <- NA
  data$civilst <- factor(data$civilst,
                         labels = c("unmarried", "married",
                                    "divorced/separated", "widow/widower"),
                         levels = c(1,2,3,4))

  # education (0 = not asked)
  data$educ[data$educ == 0] <- NA
  data$educ <- factor(data$educ,
                      labels = c("pregymnasial<9", "pregymnasial9-10",
                                 "gymnasial<2", "gymnasial2-3", "post-gymnasial<3",
                                 "post-gymnasial>3", "post-graduate"),
                      levels = c(1,2,3,4,5,6,7))

  # exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
  # not asked has 4 entries, dont know has 0, no answer has 26
  data$exercise[data$exercise %in% c(0,8,9)] <- NA
  data$exercise <- factor(data$exercise,
                          labels = c("practically none", "sometimes" ,"once a week",
                                     "twice a week", "at least twice"),
                          levels = c(1,2,3,4,5))

  # normal hours of work per week
  data$work_norm <- factor(data$work_norm,
                           labels = c("1-19hr", "20-34hr", "35-97hr",
                                      "farmer or self-employed", "other/doesntwork/dontknow"),
                           levels = c(1,2,3,4,5))

  # exercise habits (98 = dont know, 99 = no answer)
  data$work_week[data$work_week %in% c(98,99)] <- NA

  # income (999999 = missing)
  data$inc_hh[data$inc_hh == 999999] <- NA
  data$inc_salary[data$inc_salary == 999999] <- NA
  data$inc_cap[data$inc_cap == 999999] <- NA
  data$inc_work[data$inc_work == 999999] <- NA
  data$inc_pens[data$inc_pens == 999999] <- NA

  return(data)
}

	library(ggplot2)

	# returns the odds of a model given an input vector x
	get_log_odds <- function(model,x) {
	n_coeff <- length(model$coefficients)

	if (n_coeff - length(x) == 1) {
	# user forgot the first 1 in [1, x1,...]
	x <- c(1,x)
	} else if (length(x) > n_coeff + 1
	&& n_coeff - NCOL(x) == 1) {
	x <- cbind(1, x)
	}

	if (n_coeff != NCOL(x) &&
	n_coeff != length(x)) {
	expected_n <- n_coeff - 1
	print(noquote(c("X is not of correct length, each row should be of size", expected_n)))
	return(NA)
	}

	# calculate the odds
	log_odds = x %*% model$coefficients

	return(log_odds)
	}

	# creates an aggregate table of proportions for some variable
	get_aggregate <- function(dataset, list) {

	if (max(dataset$hospdays > 1)) {
	dataset$hospdays[dataset$hospdays >= 1] <- 1
	}
	if (!(c("no.hospdays") %in% colnames(dataset))) {
	dataset$no.hospdays <- 1 - dataset$hospdays
	}

	res <- aggregate(dataset[,c("hospdays","no.hospdays")],by=list(variable=list),FUN=sum)

	res$total <- res$hospdays + res$no.hospdays
	res$props <- res$hospdays / res$total

	return(res)
	}



	# compares two logistic models by calculating the log likelihood
	# and testing against the chi squared distribution at 95%
	compare_models <- function(current, new) {

	d.new <- -2*logLik(new)
	d.current <- -2*logLik(current)

	d.diff <- d.current - d.new

	result <- NA
	retval <- NA
	qquant <- NA

	df = NROW(new$coefficients) - NROW(current$coefficients)

	if (df < 1) {
	print(noquote("cannot use chisq since df is the same or lower"))
	}
	else {
	qquant <- qchisq(1-0.05,df)

	if (d.diff > qquant) {
	result <- "Yes"
	retval <- TRUE
	} else {
	result <- "No"
	retval <- FALSE
	}
	}

	result.frame = data.frame(d.new = c(d.new),
	d.current = c(d.current),
	d.diff = c(d.diff),
	df = c(df),
	qquant = c(qquant),
	reject.h0 = c(result))

	print(result.frame)

	return(retval)
	}


	setup_data_base <- function(data) {
	# redefine the hospitaldays to be 1 for anything above 1
	data$hospdays[(data$hospdays >= 1)] = 1

	# count no.hospdays to be the complement of hospdays
	data$no.hospdays = 1 - data$hospdays

	# define sex as a factor
	data$sex <- factor(data$sex, labels=c("male", "female"), levels=c(1,2))


	# redefine all answers which were "other" or "not answered" to be NA
	data$health[(data$health >= 4)] = NA
	data$health <- factor(data$health, labels=c("good", "bad", "inbetween"))

	return(data)
	}


	setup_data <- function(data) {
	#load('hilda.Rdata', envir=globalenv())

	#hildacopy <- hilda

	data <- setup_data_base(data)


	# civilst from registry
	data$civilst_rtb <- factor(data$civilst_rtb,
	labels = c("unmarried", "married man",
	"married woman living apart",
	"divorced", "widow/widower", "dead",
	"married womand with husband", "child<18",
	"foster child<18"),
	levels = c(1,2,3,4,5,6,7,8,9))

	# civilst in interview (9 = no answer)
	data$civilst[data$civilst == 9] <- NA
	data$civilst <- factor(data$civilst,
	labels = c("unmarried", "married",
	"divorced/separated", "widow/widower"),
	levels = c(1,2,3,4))

	# education (0 = not asked)
	data$educ[data$educ == 0] <- NA
	data$educ <- factor(data$educ,
	labels = c("pregymnasial<9", "pregymnasial9-10",
	"gymnasial<2", "gymnasial2-3", "post-gymnasial<3",
	"post-gymnasial>3", "post-graduate"),
	levels = c(1,2,3,4,5,6,7))

	# exercise habits (0 = not asked, 8 = dont know, 9 = no answer)
	# not asked has 4 entries, dont know has 0, no answer has 26
	data$exercise[data$exercise %in% c(0,8,9)] <- NA
	data$exercise <- factor(data$exercise,
	labels = c("practically none", "sometimes" ,"once a week",
	"twice a week", "at least twice"),
	levels = c(1,2,3,4,5))

	# normal hours of work per week
	data$work_norm <- factor(data$work_norm,
	labels = c("1-19hr", "20-34hr", "35-97hr",
	"farmer or self-employed", "other/doesntwork/dontknow"),
	levels = c(1,2,3,4,5))

	# exercise habits (98 = dont know, 99 = no answer)
	data$work_week[data$work_week %in% c(98,99)] <- NA

	# income (999999 = missing)
	data$inc_hh[data$inc_hh == 999999] <- NA
	data$inc_salary[data$inc_salary == 999999] <- NA
	data$inc_cap[data$inc_cap == 999999] <- NA
	data$inc_work[data$inc_work == 999999] <- NA
	data$inc_pens[data$inc_pens == 999999] <- NA

	return(data)
	}