timriffe/FormalDemogADhoc.R

## FormalDemogADhoc.R
# author: tim riffe
# March 24, 2014
# do with this as you will. If you sell it, I'l hunt you down.
# Thanks to Vladimir Canudas Romo for the idea!
# --------------------------------------------

# start

# This package is under development! please test!
# installation is via github. See the README for instructions
#library(devtools)
#install_github("DemogBerkeley", subdir = "DemogBerkeley", username = "UCBdemography")
library(DemogBerkeley)
library(parallel)
library(reshape2)

# define a nifty function to get CAL. it's tricky!!!
getCAL <- function(CTRY, years){
# watch out! HMD cohort data used the word 'Year' for the cohort column
  colnames(CTRY)[colnames(CTRY) == "Year"] <- "Cohort"
# get the year
  CTRY$Year     <- CTRY$Cohort + CTRY$Age
# we need a matric Age by Cohort

# see formula 1 in Vladimir's second linked paper by Guillot (2002)
  ACfem         <- acast(CTRY, Age~Cohort, value.var = "Female")
# not all cohort values are observed, keep track of NAs
  isNA          <- is.na(ACfem)
# we need to take cumulative sum, so zero-out NAs
  ACfem[isNA]   <- 0
# this is the nifty formula
  ACfemCS       <- exp(-apply(ACfem, 2, cumsum))
# put the NAs back in as 0s
  ACfemCS[isNA] <- 0
# only use cohorts observed since birth
  keepers       <- !isNA[1, ]
  ACfemCS       <- ACfemCS[, keepers]
# flip back to AP perspective to grab a year!
  APfemCS       <- AC2AP(ACfemCS)
# grab the desired years. This could be more elegant!
  colSums(APfemCS[, as.character(years)], na.rm = TRUE)
}

# ---------------------------------------------
# Demonstration:
# what are all the HMD Countries?
#CNTRIES <- getHMDcountries()

# let's check out Scandinavia... cuz they have good cohort data
CNTRIES  <- c("FIN","DNK","NOR","SWE","ISL")

# do this live so that your password isn't saved in the script!
user     <- userInput()
password <- userInput()

# grab the needed data item
ALLcMx <- mclapply(CNTRIES, function(CTRY, pw, us){
    try(readHMDweb(CTRY, "cMx_1x1", password = pw, username = us))
  }, pw = password, us = user)

# use demonstration!
Call <- lapply(ALLcMx, getCAL, years = 1950:1970)
Cal <- do.call(cbind, Call)
colnames(Cal) <- CNTRIES

matplot(years, Cal, type = 'l', col = gray(.5), lty= 1)

# EDIT: April 2, 2014
# ---------------------------------------------------
# Vladimir wrote:
# I have a question related to the improvised presentation during the workshop.
# You mentioned that you had calculated the CALs by using the cohort death rates
# cMx_1x1 from HMD instead of period death rates Mx_1x1. I went just now to look
# for those cohort Mx and it seems the HMD only has them if there is a minimum
# of 30 ages or so of information, for example Denmark only has them from 1764
# until 1981 even when information is available until 2011 for the period death
# rates. The death counts are there and I could of course calculate the exposure
# (cExposures_1x1 ) by using the population counts which are also there. But did
# you do that for your presentation? Thank you.
# ---------------------------------------------------
# I responded:
# You're right about the cMx thing. I had 30 min notice and took the easy route,
# only showing CAL for earlier years. I'll update today to make it go to the most
# recent year. Gotta look up the left truncation thing...
# ---------------------------------------------------

# now a bit better--------------
# Need to calculate cohort exposures from population counts and deaths
# classified by Lexis triangles:
# this function is much more involved because it uses lifetable methodology rather than
# assuming continuous stuff. Grabs data from web on the fly and then calculates
# CAL for males and females.

#' @param CTRY HMD country code
#' @param CTRY pw HMD password
#' @param CTRY us HMD uername
#' @param Sex \code{"m"} or \code{"f"}
#' @param N minimum number of years of data. Don't do this for countries with less than \code{N} years of data

#' @return a vetor of CAL values. Names are years.

getCAL <- function(CTRY, pw, us, sex = "m", N = 75){

    stopifnot(sex %in% c("m","f"))
    # an internal function to estimate a0,
    # similar to that proposed by Andreev-Kingkade (2011)
    # don't worry about this- you could use Coale-Demeny rule of thumb instead...
    AKq02a0 <- function(q0, sex = "m"){
      sex <- rep(sex, length(q0))
      ifelse(sex == "m",
        ifelse(q0 < .0226, {0.1493 - 2.0367 * q0},
          ifelse(q0 < 0.0785, {0.0244 + 3.4994 * q0},.2991)),
        ifelse(q0 < 0.0170, {0.1490 - 2.0867 * q0},
          ifelse(q0 < 0.0658, {0.0438 + 4.1075 * q0}, 0.3141))
      )
    }


    Pop       <- readHMDweb(CTRY, "Population", password = pw, username = us)
    Deaths    <- readHMDweb(CTRY, "Deaths_lexis", password = pw, username = us)
    LY        <- max(Deaths$Year)
    FY        <- min(Deaths$Year)
    if (LY - FY < N){
      stop("sorry, not enough years of data to have fun with this!")
    }
    # ---------------------------------------------------
    # notice that lexis deaths open age group cohort is NA.
    # tail(ALLD[[1]]) # notice that open age group cohort is NA.
    # let's just split it 60/40 between a lower and upper triangle
    # it's ad hoc, but then so is this script!
    # ------------------------------------------------
    # this does that:
    Deaths <- do.call(rbind,lapply(split(Deaths, Deaths$Year), function(.DAT){
          # copy last row
          OPEN <- .DAT[nrow(.DAT), ]
          # remove last row
          .DAT  <- .DAT[-nrow(.DAT), ]
          # make two rows:
          OPEN <- rbind(OPEN,OPEN)
          # for selection
          MFT <- c("Male","Female","Total")
          OPEN[1,MFT] <- OPEN[1,MFT] * .6
          OPEN[2,MFT] <- OPEN[2,MFT] * .4
          OPEN$Cohort <- .DAT$Cohort[nrow(.DAT)] - c(0,1)

          rbind(.DAT, OPEN)

        }))
    # lower triangle (LT) is when Year - Cohort - Age == 0
    # upper triangle (UT) is when Year - Cohort - Age == 1
    Deaths$Lexis <- with(Deaths, ifelse(Year - Cohort - Age == 0, "LT",
        ifelse(Year - Cohort - Age == 1, "UT", NA )))

    # need Cohort for reshaping
    Pop$Cohort <- Pop$Year - Pop$Age
    # --------------------------------------------------------
    # Be aware, we are working with the AC shape.
    # This ignores lower triangles in the most recent year.
    # call will be based on a vertical stack of AC parallelograms
    # rather than a perfectly tessalating stack of PC parallelograms...
    # if you prefer the later, you'll need to rethink the following somewhat.
    # --------------------------------------------------------
    # make Age x Cohort matrices:
    if (sex == "m"){
      pop       <- acast(Pop[Pop$Year < LY, ], Age ~ Cohort, value.var = "Male2")
      du        <- acast(Deaths[Deaths$Lexis == "UT" & Deaths$Year > FY, ], Age ~ Cohort, value.var = "Male")
      dl        <- acast(Deaths[Deaths$Lexis == "LT" & Deaths$Year < LY, ], Age ~ Cohort, value.var = "Male")
    }
    if (sex == "f"){
      pop       <- acast(Pop[Pop$Year < LY, ], Age ~ Cohort, value.var = "Female2")
      du        <- acast(Deaths[Deaths$Lexis == "UT" & Deaths$Year > FY, ], Age ~ Cohort, value.var = "Female")
      dl        <- acast(Deaths[Deaths$Lexis == "LT" & Deaths$Year < LY, ], Age ~ Cohort, value.var = "Female")
    }
    # --------------------------------------------------------
    # cohort range not the same: dimension to be equal
    cohs.sclice   <- sort(intersect(intersect(colnames(pop), colnames(dl)), colnames(du)))
    dl        <- dl[, cohs.sclice]
    du        <- du[, cohs.sclice]
    pop       <- pop[, cohs.sclice]

    # ---------------------------------------------------------
    # now we have the 3 items necessary:
    years     <- as.integer(colnames(dl))
    ages      <- as.integer(rownames(dl))

    #
    mx 				<- (dl + du) / (pop + (1 / 3) * (dl - du))
    qx 				<- (dl + du) / (pop + dl)	                              # Eq 70  MPv5
    ax        <- ((mx + 1) * qx - mx) / (mx * qx)                     # Eq 71  MPv5

    ax[1, ]   <- AKq02a0(qx[1,], sex = sex)
    # original formula only holds for continuous hazards. We instead
    # will do some lifetable foo. Only need to get as far as Lx:

    # this is a cheap way to deal with left truncation

    qxtemp <- qx[,1:(nrow(qx)+4)]
    NAind  <- is.na(qxtemp)
    qxtemp <- t(apply(qxtemp, 1, function(qa){
          # qa <- qxtemp[111,]
          if (any(is.na(qa))){
            # arithmetic mean of first 5 non-NA cohort Lx values...
            qa[is.na(qa)] <- mean(qa[!is.na(qa)][1:5], na.rm = TRUE)
          }
          qa
        }))
    qx[,1:(nrow(qx)+4)][NAind] <- qxtemp[NAind]
    #qx[NAind]           <- NA

    ax[is.nan(ax)]  <- .5
    px 				      <- 1 - qx 																				# Eq 64 MPv5
    px[is.nan(px)]  <- 0
    # lx needs to be done columnwise over px, argument 2 refers to the margin.
    lx 				<- apply(px, 2, function(px.){ 					      # Eq 65 MPv5
        c(1, cumprod(px.[-length(px.)]))
      })
    rownames(lx)    <- 0:110 # these got thrown off because l0 imputed.
    lx[is.na(lx)]   <- 0
    dx 				      <- lx * qx 																				# Eq 66 MPv5
    dx[is.na(dx)]   <- 0
    Lx 				      <- lx - (1 - ax) * dx 														# Eq 67 MPv5

    # now transform back to period.
    LxAP      <- AC2AP(Lx)

    # This could be more elegant!

    colSums(LxAP, na.rm = TRUE)
  }

  # demonstration for same countries
  CAL <- do.call(rbind,mclapply(CNTRIES, function(CTRY){
       CALm <- getCAL(CTRY=CTRY,pw=pw,us=us,"m")
       CALf <- getCAL(CTRY=CTRY,pw=pw,us=us,"f")
       years <- as.integer(CALm)
       cbind(CTRY = CTRY, Year = years, Male = CALm, Female = CALf)
    }))
	# author: tim riffe
	# March 24, 2014
	# do with this as you will. If you sell it, I'l hunt you down.
	# Thanks to Vladimir Canudas Romo for the idea!
	# --------------------------------------------

	# start

	# This package is under development! please test!
	# installation is via github. See the README for instructions
	#library(devtools)
	#install_github("DemogBerkeley", subdir = "DemogBerkeley", username = "UCBdemography")
	library(DemogBerkeley)
	library(parallel)
	library(reshape2)

	# define a nifty function to get CAL. it's tricky!!!
	getCAL <- function(CTRY, years){
	# watch out! HMD cohort data used the word 'Year' for the cohort column
	colnames(CTRY)[colnames(CTRY) == "Year"] <- "Cohort"
	# get the year
	CTRY$Year <- CTRY$Cohort + CTRY$Age
	# we need a matric Age by Cohort

	# see formula 1 in Vladimir's second linked paper by Guillot (2002)
	ACfem <- acast(CTRY, Age~Cohort, value.var = "Female")
	# not all cohort values are observed, keep track of NAs
	isNA <- is.na(ACfem)
	# we need to take cumulative sum, so zero-out NAs
	ACfem[isNA] <- 0
	# this is the nifty formula
	ACfemCS <- exp(-apply(ACfem, 2, cumsum))
	# put the NAs back in as 0s
	ACfemCS[isNA] <- 0
	# only use cohorts observed since birth
	keepers <- !isNA[1, ]
	ACfemCS <- ACfemCS[, keepers]
	# flip back to AP perspective to grab a year!
	APfemCS <- AC2AP(ACfemCS)
	# grab the desired years. This could be more elegant!
	colSums(APfemCS[, as.character(years)], na.rm = TRUE)
	}

	# ---------------------------------------------
	# Demonstration:
	# what are all the HMD Countries?
	#CNTRIES <- getHMDcountries()

	# let's check out Scandinavia... cuz they have good cohort data
	CNTRIES <- c("FIN","DNK","NOR","SWE","ISL")

	# do this live so that your password isn't saved in the script!
	user <- userInput()
	password <- userInput()

	# grab the needed data item
	ALLcMx <- mclapply(CNTRIES, function(CTRY, pw, us){
	try(readHMDweb(CTRY, "cMx_1x1", password = pw, username = us))
	}, pw = password, us = user)

	# use demonstration!
	Call <- lapply(ALLcMx, getCAL, years = 1950:1970)
	Cal <- do.call(cbind, Call)
	colnames(Cal) <- CNTRIES

	matplot(years, Cal, type = 'l', col = gray(.5), lty= 1)

	# EDIT: April 2, 2014
	# ---------------------------------------------------
	# Vladimir wrote:
	# I have a question related to the improvised presentation during the workshop.
	# You mentioned that you had calculated the CALs by using the cohort death rates
	# cMx_1x1 from HMD instead of period death rates Mx_1x1. I went just now to look
	# for those cohort Mx and it seems the HMD only has them if there is a minimum
	# of 30 ages or so of information, for example Denmark only has them from 1764
	# until 1981 even when information is available until 2011 for the period death
	# rates. The death counts are there and I could of course calculate the exposure
	# (cExposures_1x1 ) by using the population counts which are also there. But did
	# you do that for your presentation? Thank you.
	# ---------------------------------------------------
	# I responded:
	# You're right about the cMx thing. I had 30 min notice and took the easy route,
	# only showing CAL for earlier years. I'll update today to make it go to the most
	# recent year. Gotta look up the left truncation thing...
	# ---------------------------------------------------

	# now a bit better--------------
	# Need to calculate cohort exposures from population counts and deaths
	# classified by Lexis triangles:
	# this function is much more involved because it uses lifetable methodology rather than
	# assuming continuous stuff. Grabs data from web on the fly and then calculates
	# CAL for males and females.

	#' @param CTRY HMD country code
	#' @param CTRY pw HMD password
	#' @param CTRY us HMD uername
	#' @param Sex \code{"m"} or \code{"f"}
	#' @param N minimum number of years of data. Don't do this for countries with less than \code{N} years of data

	#' @return a vetor of CAL values. Names are years.

	getCAL <- function(CTRY, pw, us, sex = "m", N = 75){

	stopifnot(sex %in% c("m","f"))
	# an internal function to estimate a0,
	# similar to that proposed by Andreev-Kingkade (2011)
	# don't worry about this- you could use Coale-Demeny rule of thumb instead...
	AKq02a0 <- function(q0, sex = "m"){
	sex <- rep(sex, length(q0))
	ifelse(sex == "m",
	ifelse(q0 < .0226, {0.1493 - 2.0367 * q0},
	ifelse(q0 < 0.0785, {0.0244 + 3.4994 * q0},.2991)),
	ifelse(q0 < 0.0170, {0.1490 - 2.0867 * q0},
	ifelse(q0 < 0.0658, {0.0438 + 4.1075 * q0}, 0.3141))
	)
	}


	Pop <- readHMDweb(CTRY, "Population", password = pw, username = us)
	Deaths <- readHMDweb(CTRY, "Deaths_lexis", password = pw, username = us)
	LY <- max(Deaths$Year)
	FY <- min(Deaths$Year)
	if (LY - FY < N){
	stop("sorry, not enough years of data to have fun with this!")
	}
	# ---------------------------------------------------
	# notice that lexis deaths open age group cohort is NA.
	# tail(ALLD[[1]]) # notice that open age group cohort is NA.
	# let's just split it 60/40 between a lower and upper triangle
	# it's ad hoc, but then so is this script!
	# ------------------------------------------------
	# this does that:
	Deaths <- do.call(rbind,lapply(split(Deaths, Deaths$Year), function(.DAT){
	# copy last row
	OPEN <- .DAT[nrow(.DAT), ]
	# remove last row
	.DAT <- .DAT[-nrow(.DAT), ]
	# make two rows:
	OPEN <- rbind(OPEN,OPEN)
	# for selection
	MFT <- c("Male","Female","Total")
	OPEN[1,MFT] <- OPEN[1,MFT] * .6
	OPEN[2,MFT] <- OPEN[2,MFT] * .4
	OPEN$Cohort <- .DAT$Cohort[nrow(.DAT)] - c(0,1)

	rbind(.DAT, OPEN)

	}))
	# lower triangle (LT) is when Year - Cohort - Age == 0
	# upper triangle (UT) is when Year - Cohort - Age == 1
	Deaths$Lexis <- with(Deaths, ifelse(Year - Cohort - Age == 0, "LT",
	ifelse(Year - Cohort - Age == 1, "UT", NA )))

	# need Cohort for reshaping
	Pop$Cohort <- Pop$Year - Pop$Age
	# --------------------------------------------------------
	# Be aware, we are working with the AC shape.
	# This ignores lower triangles in the most recent year.
	# call will be based on a vertical stack of AC parallelograms
	# rather than a perfectly tessalating stack of PC parallelograms...
	# if you prefer the later, you'll need to rethink the following somewhat.
	# --------------------------------------------------------
	# make Age x Cohort matrices:
	if (sex == "m"){
	pop <- acast(Pop[Pop$Year < LY, ], Age ~ Cohort, value.var = "Male2")
	du <- acast(Deaths[Deaths$Lexis == "UT" & Deaths$Year > FY, ], Age ~ Cohort, value.var = "Male")
	dl <- acast(Deaths[Deaths$Lexis == "LT" & Deaths$Year < LY, ], Age ~ Cohort, value.var = "Male")
	}
	if (sex == "f"){
	pop <- acast(Pop[Pop$Year < LY, ], Age ~ Cohort, value.var = "Female2")
	du <- acast(Deaths[Deaths$Lexis == "UT" & Deaths$Year > FY, ], Age ~ Cohort, value.var = "Female")
	dl <- acast(Deaths[Deaths$Lexis == "LT" & Deaths$Year < LY, ], Age ~ Cohort, value.var = "Female")
	}
	# --------------------------------------------------------
	# cohort range not the same: dimension to be equal
	cohs.sclice <- sort(intersect(intersect(colnames(pop), colnames(dl)), colnames(du)))
	dl <- dl[, cohs.sclice]
	du <- du[, cohs.sclice]
	pop <- pop[, cohs.sclice]

	# ---------------------------------------------------------
	# now we have the 3 items necessary:
	years <- as.integer(colnames(dl))
	ages <- as.integer(rownames(dl))

	#
	mx <- (dl + du) / (pop + (1 / 3) * (dl - du))
	qx <- (dl + du) / (pop + dl) # Eq 70 MPv5
	ax <- ((mx + 1) * qx - mx) / (mx * qx) # Eq 71 MPv5

	ax[1, ] <- AKq02a0(qx[1,], sex = sex)
	# original formula only holds for continuous hazards. We instead
	# will do some lifetable foo. Only need to get as far as Lx:

	# this is a cheap way to deal with left truncation

	qxtemp <- qx[,1:(nrow(qx)+4)]
	NAind <- is.na(qxtemp)
	qxtemp <- t(apply(qxtemp, 1, function(qa){
	# qa <- qxtemp[111,]
	if (any(is.na(qa))){
	# arithmetic mean of first 5 non-NA cohort Lx values...
	qa[is.na(qa)] <- mean(qa[!is.na(qa)][1:5], na.rm = TRUE)
	}
	qa
	}))
	qx[,1:(nrow(qx)+4)][NAind] <- qxtemp[NAind]
	#qx[NAind] <- NA

	ax[is.nan(ax)] <- .5
	px <- 1 - qx # Eq 64 MPv5
	px[is.nan(px)] <- 0
	# lx needs to be done columnwise over px, argument 2 refers to the margin.
	lx <- apply(px, 2, function(px.){ # Eq 65 MPv5
	c(1, cumprod(px.[-length(px.)]))
	})
	rownames(lx) <- 0:110 # these got thrown off because l0 imputed.
	lx[is.na(lx)] <- 0
	dx <- lx * qx # Eq 66 MPv5
	dx[is.na(dx)] <- 0
	Lx <- lx - (1 - ax) * dx # Eq 67 MPv5

	# now transform back to period.
	LxAP <- AC2AP(Lx)

	# This could be more elegant!

	colSums(LxAP, na.rm = TRUE)
	}

	# demonstration for same countries
	CAL <- do.call(rbind,mclapply(CNTRIES, function(CTRY){
	CALm <- getCAL(CTRY=CTRY,pw=pw,us=us,"m")
	CALf <- getCAL(CTRY=CTRY,pw=pw,us=us,"f")
	years <- as.integer(CALm)
	cbind(CTRY = CTRY, Year = years, Male = CALm, Female = CALf)
	}))